diff --git a/.clang-format b/.clang-format
index 806b4f010..5c043b486 100644
--- a/.clang-format
+++ b/.clang-format
@@ -33,7 +33,7 @@ IncludeCategories:
 IndentExternBlock: NoIndent
 IndentCaseLabels: true
-IndentPPDirectives: BeforeHash
+#IndentPPDirectives: None
 IndentAccessModifiers: false
 AccessModifierOffset: -4
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index df7fd9d26..8f09562dd 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -53,8 +53,8 @@ jobs:
     - name: Configure CMake
      # some problem with simde
-     # run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DBOX2D_SAMPLES=OFF -DBOX2D_SANITIZE=ON -DBUILD_SHARED_LIBS=OFF
-     run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DBOX2D_SAMPLES=OFF -DBUILD_SHARED_LIBS=OFF
+     run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DBOX2D_SAMPLES=OFF -DBOX2D_SANITIZE=ON -DBUILD_SHARED_LIBS=OFF
+     # run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DBOX2D_SAMPLES=OFF -DBUILD_SHARED_LIBS=OFF

    - name: Build
      run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8c1390e7c..a909bab96 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,8 +1,9 @@
 cmake_minimum_required(VERSION 3.22)
 include(FetchContent)
+include(CMakeDependentOption)

 project(box2d
-	VERSION 3.0.0
+	VERSION 3.0.1
 	DESCRIPTION "A 2D physics engine for games"
 	HOMEPAGE_URL "https://box2d.org"
 	LANGUAGES C CXX
@@ -34,10 +35,13 @@ if (MSVC OR APPLE)
 	endif()
 endif()

+option(BOX2D_ENABLE_SIMD "Enable SIMD math (faster)" ON)
+
 if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64")
-	option(BOX2D_AVX2 "Enable AVX2 (faster)" ON)
+	cmake_dependent_option(BOX2D_AVX2 "Enable AVX2" OFF "BOX2D_ENABLE_SIMD" OFF)
 endif()
+
 if(PROJECT_IS_TOP_LEVEL)
 	# Needed for samples.exe to find box2d.dll
 	# set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/bin")
@@ -52,8 +56,6 @@ set(CMAKE_COMPILE_WARNING_AS_ERROR ON)
 set_property(GLOBAL PROPERTY USE_FOLDERS ON)
 set(CMAKE_VERBOSE_MAKEFILE ON)

-# The Box2D library uses simde https://github.com/simd-everywhere/simde
-add_subdirectory(extern/simde)
 add_subdirectory(src)

 # This hides samples, test, and doxygen from apps that use box2d via FetchContent
@@ -95,6 +97,7 @@ if(PROJECT_IS_TOP_LEVEL)
 	if(NOT BUILD_SHARED_LIBS AND BOX2D_UNIT_TESTS)
 		message(STATUS "Adding Box2D unit tests")
 		add_subdirectory(test)
+		set_target_properties(test PROPERTIES XCODE_GENERATE_SCHEME TRUE)
 	else()
 		message(STATUS "Skipping Box2D unit tests")
 	endif()
@@ -107,10 +110,15 @@ if(PROJECT_IS_TOP_LEVEL)
 		set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT samples)
 		set_property(TARGET samples PROPERTY VS_DEBUGGER_WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}")
 	endif()
+
+	set_target_properties(samples PROPERTIES
+		XCODE_GENERATE_SCHEME TRUE
+		XCODE_SCHEME_WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}")
 endif()

 if(BOX2D_BENCHMARKS)
 	add_subdirectory(benchmark)
+	set_target_properties(benchmark PROPERTIES XCODE_GENERATE_SCHEME TRUE)
 endif()

 if(BOX2D_DOCS)
diff --git a/README.md b/README.md
index 903a950c6..685ddd919 100644
--- a/README.md
+++ b/README.md
@@ -54,7 +54,7 @@ Box2D is a 2D physics engine for games.
 - cmake -G Xcode ..
- open box2d.xcodeproj - Select the samples scheme -- Edit the scheme to set a custom working directory, make this be in box2d/samples +- Edit the scheme to set a custom working directory to the box2d directory - You can now build and run the samples ## Compatibility diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index ff506a6ce..ab457e870 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -19,4 +19,4 @@ if(MSVC) # target_compile_options(benchmark PRIVATE /experimental:c11atomics) endif() -target_link_libraries(benchmark PRIVATE box2d enkiTS simde) +target_link_libraries(benchmark PRIVATE box2d enkiTS) diff --git a/benchmark/amd7950x/joint_grid.csv b/benchmark/amd7950x/joint_grid.csv index 23169e438..5f7e3734c 100644 --- a/benchmark/amd7950x/joint_grid.csv +++ b/benchmark/amd7950x/joint_grid.csv @@ -1,9 +1,9 @@ threads,fps -1,331.343 -2,638.04 -3,932.731 -4,1200.15 -5,1480.23 -6,1718.79 -7,1930.12 -8,2133.65 +1,333.121 +2,638.057 +3,928.95 +4,1205.85 +5,1479.54 +6,1699.99 +7,1974.84 +8,2043.64 diff --git a/benchmark/amd7950x/large_pyramid.csv b/benchmark/amd7950x/large_pyramid.csv index cec711076..dc4ea5b3b 100644 --- a/benchmark/amd7950x/large_pyramid.csv +++ b/benchmark/amd7950x/large_pyramid.csv @@ -1,9 +1,9 @@ threads,fps -1,325.705 -2,616.127 -3,886.575 -4,1118.85 -5,1331.22 -6,1498.6 -7,1685.28 -8,1728.1 +1,336.895 +2,602.665 +3,878.207 +4,1117.02 +5,1304.79 +6,1482.92 +7,1663.97 +8,1661.13 diff --git a/benchmark/amd7950x/many_pyramids.csv b/benchmark/amd7950x/many_pyramids.csv index f56a4281c..a297404f3 100644 --- a/benchmark/amd7950x/many_pyramids.csv +++ b/benchmark/amd7950x/many_pyramids.csv @@ -1,9 +1,9 @@ threads,fps -1,82.8619 -2,160.906 -3,236.027 -4,300.688 -5,368.315 -6,429.822 -7,498.81 -8,549.271 +1,84.8025 +2,163.264 +3,234.388 +4,305.216 +5,369.85 +6,434.45 +7,497.573 +8,525.427 diff --git a/benchmark/amd7950x/smash.csv b/benchmark/amd7950x/smash.csv index 9c2fd141a..2d5e645e8 100644 --- a/benchmark/amd7950x/smash.csv +++ b/benchmark/amd7950x/smash.csv @@ -1,9 +1,9 @@ threads,fps -1,173.898 -2,277.19 -3,357.566 -4,430.528 -5,483.446 -6,525.652 -7,566.859 -8,598.553 +1,174.051 +2,276.742 +3,352.751 +4,421.773 +5,479.049 +6,522.318 +7,556.193 +8,586.672 diff --git a/benchmark/amd7950x/tumbler.csv b/benchmark/amd7950x/tumbler.csv index 167b662dd..22784b027 100644 --- a/benchmark/amd7950x/tumbler.csv +++ b/benchmark/amd7950x/tumbler.csv @@ -1,9 +1,9 @@ threads,fps -1,373.066 -2,581.852 -3,764.444 -4,902.898 -5,1044.99 -6,1143.44 -7,1229.87 -8,1299.61 +1,376.3 +2,576.749 +3,737.749 +4,883.315 +5,1024.69 +6,1120.48 +7,1197.2 +8,1212.85 diff --git a/benchmark/amd7950x_float/joint_grid.csv b/benchmark/amd7950x_float/joint_grid.csv new file mode 100644 index 000000000..7810eb320 --- /dev/null +++ b/benchmark/amd7950x_float/joint_grid.csv @@ -0,0 +1,9 @@ +threads,fps +1,362.12 +2,685.873 +3,998.169 +4,1274.09 +5,1590.5 +6,1841.48 +7,2036.83 +8,2152.76 diff --git a/benchmark/amd7950x_float/large_pyramid.csv b/benchmark/amd7950x_float/large_pyramid.csv new file mode 100644 index 000000000..d9ca9f810 --- /dev/null +++ b/benchmark/amd7950x_float/large_pyramid.csv @@ -0,0 +1,9 @@ +threads,fps +1,148.238 +2,279.403 +3,407.797 +4,524.174 +5,635.423 +6,716.434 +7,799.394 +8,880.242 diff --git a/benchmark/amd7950x_float/many_pyramids.csv b/benchmark/amd7950x_float/many_pyramids.csv new file mode 100644 index 000000000..d55cd46b8 --- /dev/null +++ b/benchmark/amd7950x_float/many_pyramids.csv @@ -0,0 +1,9 @@ +threads,fps +1,38.1845 +2,73.9263 
+3,108.337 +4,139.456 +5,171.725 +6,198.861 +7,229.515 +8,253.222 diff --git a/benchmark/amd7950x_float/smash.csv b/benchmark/amd7950x_float/smash.csv new file mode 100644 index 000000000..6dc232869 --- /dev/null +++ b/benchmark/amd7950x_float/smash.csv @@ -0,0 +1,9 @@ +threads,fps +1,130.637 +2,210.938 +3,275.828 +4,341.204 +5,386.281 +6,426.426 +7,452.909 +8,467.611 diff --git a/benchmark/amd7950x_float/tumbler.csv b/benchmark/amd7950x_float/tumbler.csv new file mode 100644 index 000000000..f4e727728 --- /dev/null +++ b/benchmark/amd7950x_float/tumbler.csv @@ -0,0 +1,9 @@ +threads,fps +1,247.067 +2,403.606 +3,523.705 +4,629.426 +5,734.511 +6,800.338 +7,857.235 +8,898.919 diff --git a/benchmark/amd7950x_sse2/joint_grid.csv b/benchmark/amd7950x_sse2/joint_grid.csv index 6c58c02a2..022ee31c3 100644 --- a/benchmark/amd7950x_sse2/joint_grid.csv +++ b/benchmark/amd7950x_sse2/joint_grid.csv @@ -1,9 +1,9 @@ threads,fps -1,357.551 -2,691.193 -3,1010.45 -4,1317.42 -5,1590.65 -6,1858.78 -7,2074.2 -8,2261.67 +1,360.077 +2,687.48 +3,998.479 +4,1261.45 +5,1581.53 +6,1825.64 +7,2067.46 +8,2216.48 diff --git a/benchmark/amd7950x_sse2/large_pyramid.csv b/benchmark/amd7950x_sse2/large_pyramid.csv index a62e7c909..2b295d7d4 100644 --- a/benchmark/amd7950x_sse2/large_pyramid.csv +++ b/benchmark/amd7950x_sse2/large_pyramid.csv @@ -1,9 +1,9 @@ threads,fps -1,186.185 -2,351.045 -3,511.316 -4,636.035 -5,765.404 -6,875.296 -7,991.353 -8,961.402 +1,288.876 +2,527.399 +3,769.81 +4,982.428 +5,1151.91 +6,1323.49 +7,1474.09 +8,1552.6 diff --git a/benchmark/amd7950x_sse2/many_pyramids.csv b/benchmark/amd7950x_sse2/many_pyramids.csv index 57ed916d6..ae10f8511 100644 --- a/benchmark/amd7950x_sse2/many_pyramids.csv +++ b/benchmark/amd7950x_sse2/many_pyramids.csv @@ -1,9 +1,9 @@ threads,fps -1,48.5561 -2,92.6231 -3,137.175 -4,176.644 -5,214.941 -6,253.39 -7,288.631 -8,312.527 +1,75.3333 +2,141.977 +3,205.225 +4,266.523 +5,330.244 +6,380.809 +7,433.287 +8,482.241 diff --git a/benchmark/amd7950x_sse2/smash.csv b/benchmark/amd7950x_sse2/smash.csv index 9763183a0..db60d7201 100644 --- a/benchmark/amd7950x_sse2/smash.csv +++ b/benchmark/amd7950x_sse2/smash.csv @@ -1,9 +1,9 @@ threads,fps -1,142.532 -2,228.987 -3,299.951 -4,364.679 -5,413.564 -6,453.351 -7,489.239 -8,519.379 +1,165.538 +2,263.517 +3,338.066 +4,405.629 +5,461.45 +6,506.119 +7,540.182 +8,563.682 diff --git a/benchmark/amd7950x_sse2/tumbler.csv b/benchmark/amd7950x_sse2/tumbler.csv index 82d5aaaeb..456233ed5 100644 --- a/benchmark/amd7950x_sse2/tumbler.csv +++ b/benchmark/amd7950x_sse2/tumbler.csv @@ -1,9 +1,9 @@ threads,fps -1,276.905 -2,453.522 -3,592.946 -4,702.383 -5,826.52 -6,919.179 -7,1009.05 -8,1062.61 +1,326.657 +2,521.743 +3,671.396 +4,805.81 +5,928.274 +6,1019.45 +7,1082.76 +8,1109.95 diff --git a/benchmark/benchmark_results.html b/benchmark/benchmark_results.html index 5a0f3ebc7..0d1c44a41 100644 --- a/benchmark/benchmark_results.html +++ b/benchmark/benchmark_results.html @@ -37,7 +37,7 @@ } .chart-item { width: 100%; - height: 500px; /* Fixed height for each chart */ + height: 700px; /* Fixed height for each chart */ } canvas { background-color: #1e1e1e; @@ -110,7 +110,26 @@


} ]; - const processors = ['amd7950x', 'm2air']; + const processors = [ + { + folder: 'amd7950x', + label: 'avx2' + }, + { + folder: 'm2air', + label: 'neon' + }, + { + folder: 'amd7950x_sse2', + label: 'sse2', + }, + { + + folder: 'amd7950x_float', + label: 'float' + } + ]; + const branchSelect = document.getElementById('branchSelect'); const loadButton = document.getElementById('loadButton'); var currentBranch = null; @@ -121,7 +140,7 @@


} async function fetchBranches() { - const url = `https://api.github.com/repos/erincatto/box2c/branches`; + const url = `https://api.github.com/repos/erincatto/box2d/branches`; const response = await fetch(url); if (!response.ok) { throw new Error(`HTTP error! status: ${response.status}`); @@ -167,7 +186,7 @@


{ for (const processor of processors) { - const csvUrl = repoBaseUrl + '/' + branch + '/benchmark/' + processor + '/' + file; + const csvUrl = repoBaseUrl + '/' + branch + '/benchmark/' + processor.folder + '/' + file; try { @@ -177,13 +196,31 @@


if (data != null) { - datasets.push({ - label: processor + '-' + branch, + if (branch == 'main') + { + datasets.push({ + label: processor.label, + + // convert the csv data into chart ready data + data: data.map(row => ({ x: row.threads, y: row.fps })), + fill: false + }); + } + else + { + // remove leading name from branch name (e.g. erincatto) + const branchLabel = branch.split('/').pop(); - // convert the csv data into chart ready data - data: data.map(row => ({ x: row.threads, y: row.fps })), - fill: false - }); + datasets.push({ + label: processor.label + '/' + branchLabel, + + // convert the csv data into chart ready data + data: data.map(row => ({ x: row.threads, y: row.fps })), + + borderDash: [5, 5], + fill: false + }); + } } } catch (error) { diff --git a/benchmark/m2air/joint_grid.csv b/benchmark/m2air/joint_grid.csv index 9e9d17a86..bde92d859 100644 --- a/benchmark/m2air/joint_grid.csv +++ b/benchmark/m2air/joint_grid.csv @@ -1,5 +1,5 @@ threads,fps -1,510.67 -2,955.752 -3,1384.14 -4,1651.69 +1,515.583 +2,968.592 +3,1393.95 +4,1652.07 diff --git a/benchmark/m2air/large_pyramid.csv b/benchmark/m2air/large_pyramid.csv index 60f80a59a..3517e4263 100644 --- a/benchmark/m2air/large_pyramid.csv +++ b/benchmark/m2air/large_pyramid.csv @@ -1,5 +1,5 @@ threads,fps -1,284.342 -2,526.955 -3,728.772 -4,911.715 +1,327.475 +2,606.037 +3,843.779 +4,1057.89 diff --git a/benchmark/m2air/many_pyramids.csv b/benchmark/m2air/many_pyramids.csv index 757746687..1edc5bc6a 100644 --- a/benchmark/m2air/many_pyramids.csv +++ b/benchmark/m2air/many_pyramids.csv @@ -1,5 +1,5 @@ threads,fps -1,73.9053 -2,139.551 -3,193.414 -4,234.215 +1,85.4238 +2,159.788 +3,219.475 +4,261.439 diff --git a/benchmark/m2air/smash.csv b/benchmark/m2air/smash.csv index fdc6057eb..8bed7f253 100644 --- a/benchmark/m2air/smash.csv +++ b/benchmark/m2air/smash.csv @@ -1,5 +1,5 @@ threads,fps -1,161.17 -2,256.422 -3,321.137 -4,385.797 +1,166.289 +2,262.228 +3,329.267 +4,393.513 diff --git a/benchmark/m2air/tumbler.csv b/benchmark/m2air/tumbler.csv index ca11db241..e926a1e57 100644 --- a/benchmark/m2air/tumbler.csv +++ b/benchmark/m2air/tumbler.csv @@ -1,5 +1,5 @@ threads,fps -1,340.404 -2,538.587 -3,661.721 -4,781.784 +1,351.845 +2,549.496 +3,674.889 +4,801.015 diff --git a/docs/FAQ.md b/docs/FAQ.md index 12b2680e2..c842b800e 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -1,7 +1,7 @@ # FAQ ## What is Box2D? -Box2D is a feature rich 2D rigid body physics engine, written in C11 by Erin Catto. It has been used in many games and in many +Box2D is a feature rich 2D rigid body physics engine, written in C17 by Erin Catto. It has been used in many games and in many game engines. Box2D uses the [MIT license](https://en.wikipedia.org/wiki/MIT_License) license and can be used free of charge. Credit @@ -9,13 +9,13 @@ should be included if possible. Support is [appreciated](https://github.com/spon Box2D [logo](https://box2d.org/images/logo.svg). ## What platforms does Box2D support? -Box2D is developed using C11. Ports and bindings are likely available for most languages and platforms. +Box2D is developed using C17. Ports and bindings are likely available for most languages and platforms. -Erin Catto maintains the C11 version, but provides no support for other languages. Other languages are supported +Erin Catto maintains the C17 version, but provides no support for other languages. Other languages are supported by the community and possibly by the authors of those ports. ## Who makes it? 
-Erin Catto is the creator and sole contributor of the C11 version of Box2D, with various others supporting the ports. Box2D is an open source project, and accepts community feedback. +Erin Catto is the creator and sole contributor of the C17 version of Box2D, with various others supporting the ports. Box2D is an open source project, and accepts community feedback. ## How do I get help? You should read the documentation and the rest of this FAQ first. Also, you should study the examples included in the source distribution. Then you can visit the [Discord](https://discord.gg/aM4mRKxW) to ask any remaining questions. @@ -49,7 +49,7 @@ Now the only tricky part is choosing a scaling factor. This really depends on yo This [repo](https://github.com/erincatto/box2d-raylib) shows how to convert meters to pixels. ### Why don't you use this awesome language? -Box2D is designed to be portable and easy to wrap with other languages, so I decided to use C11. I used C11 to get support for atomics. +Box2D is designed to be portable and easy to wrap with other languages, so I decided to use C17. I used C17 to get support for atomics. ### Can I use Box2D in a DLL? Yes. See the CMake option `BUILD_SHARED_LIBS`. diff --git a/extern/simde/CMakeLists.txt b/extern/simde/CMakeLists.txt deleted file mode 100644 index a65f35953..000000000 --- a/extern/simde/CMakeLists.txt +++ /dev/null @@ -1,32 +0,0 @@ -# This is a reduced version of https://github.com/simd-everywhere/simde - -add_library( - simde INTERFACE - check.h - debug-trap.h - hedley.h - simde-aes.h - simde-align.h - simde-arch.h - simde-common.h - simde-constify.h - simde-detect-clang.h - simde-diagnostic.h - simde-f16.h - simde-features.h - simde-math.h - x86/aes.h - x86/avx.h - x86/avx2.h - x86/f16c.h - x86/fma.h - x86/mmx.h - x86/sse.h - x86/sse2.h - x86/sse3.h - x86/sse4.1.h - x86/sse4.2.h - x86/ssse3.h -) - -target_include_directories(simde INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/extern/simde/check.h b/extern/simde/check.h deleted file mode 100644 index 7d17d2925..000000000 --- a/extern/simde/check.h +++ /dev/null @@ -1,276 +0,0 @@ -/* Check (assertions) - * Portable Snippets - https://github.com/nemequ/portable-snippets - * Created by Evan Nemerson - * - * To the extent possible under law, the authors have waived all - * copyright and related or neighboring rights to this code. 
For - * details, see the Creative Commons Zero 1.0 Universal license at - * https://creativecommons.org/publicdomain/zero/1.0/ - * - * SPDX-License-Identifier: CC0-1.0 - */ - -#if !defined(SIMDE_CHECK_H) -#define SIMDE_CHECK_H - -#if !defined(SIMDE_NDEBUG) && !defined(SIMDE_DEBUG) -# define SIMDE_NDEBUG 1 -#endif - -#include "hedley.h" -#include "simde-diagnostic.h" -#include - -#if !defined(_WIN32) -# define SIMDE_SIZE_MODIFIER "z" -# define SIMDE_CHAR_MODIFIER "hh" -# define SIMDE_SHORT_MODIFIER "h" -#else -# if defined(_M_X64) || defined(__amd64__) -# define SIMDE_SIZE_MODIFIER "I64" -# else -# define SIMDE_SIZE_MODIFIER "" -# endif -# define SIMDE_CHAR_MODIFIER "" -# define SIMDE_SHORT_MODIFIER "" -#endif - -#if defined(_MSC_VER) && (_MSC_VER >= 1500) -# define SIMDE_PUSH_DISABLE_MSVC_C4127_ __pragma(warning(push)) __pragma(warning(disable:4127)) -# define SIMDE_POP_DISABLE_MSVC_C4127_ __pragma(warning(pop)) -#else -# define SIMDE_PUSH_DISABLE_MSVC_C4127_ -# define SIMDE_POP_DISABLE_MSVC_C4127_ -#endif - -#if !defined(simde_errorf) -# if defined(__has_include) -# if __has_include() -# include -# endif -# elif defined(SIMDE_STDC_HOSTED) -# if SIMDE_STDC_HOSTED == 1 -# include -# endif -# elif defined(__STDC_HOSTED__) -# if __STDC_HOSTETD__ == 1 -# include -# endif -# endif - -# include "debug-trap.h" - - HEDLEY_DIAGNOSTIC_PUSH - SIMDE_DIAGNOSTIC_DISABLE_VARIADIC_MACROS_ -# if defined(EOF) -# define simde_errorf(format, ...) (fprintf(stderr, format, __VA_ARGS__), abort()) -# else -# define simde_errorf(format, ...) (simde_trap()) -# endif - HEDLEY_DIAGNOSTIC_POP -#endif - -#define simde_error(msg) simde_errorf("%s", msg) - -#if defined(SIMDE_NDEBUG) || \ - (defined(__cplusplus) && (__cplusplus < 201103L)) || \ - (defined(__STDC__) && (__STDC__ < 199901L)) -# if defined(SIMDE_CHECK_FAIL_DEFINED) -# define simde_assert(expr) -# else -# if defined(HEDLEY_ASSUME) -# define simde_assert(expr) HEDLEY_ASSUME(expr) -# elif HEDLEY_GCC_VERSION_CHECK(4,5,0) -# define simde_assert(expr) ((void) (!!(expr) ? 
1 : (__builtin_unreachable(), 1))) -# elif HEDLEY_MSVC_VERSION_CHECK(13,10,0) -# define simde_assert(expr) __assume(expr) -# else -# define simde_assert(expr) -# endif -# endif -# define simde_assert_true(expr) simde_assert(expr) -# define simde_assert_false(expr) simde_assert(!(expr)) -# define simde_assert_type_full(prefix, suffix, T, fmt, a, op, b) simde_assert(((a) op (b))) -# define simde_assert_double_equal(a, b, precision) -# define simde_assert_string_equal(a, b) -# define simde_assert_string_not_equal(a, b) -# define simde_assert_memory_equal(size, a, b) -# define simde_assert_memory_not_equal(size, a, b) -#else -# define simde_assert(expr) \ - do { \ - if (!HEDLEY_LIKELY(expr)) { \ - simde_error("assertion failed: " #expr "\n"); \ - } \ - SIMDE_PUSH_DISABLE_MSVC_C4127_ \ - } while (0) \ - SIMDE_POP_DISABLE_MSVC_C4127_ - -# define simde_assert_true(expr) \ - do { \ - if (!HEDLEY_LIKELY(expr)) { \ - simde_error("assertion failed: " #expr " is not true\n"); \ - } \ - SIMDE_PUSH_DISABLE_MSVC_C4127_ \ - } while (0) \ - SIMDE_POP_DISABLE_MSVC_C4127_ - -# define simde_assert_false(expr) \ - do { \ - if (!HEDLEY_LIKELY(!(expr))) { \ - simde_error("assertion failed: " #expr " is not false\n"); \ - } \ - SIMDE_PUSH_DISABLE_MSVC_C4127_ \ - } while (0) \ - SIMDE_POP_DISABLE_MSVC_C4127_ - -# define simde_assert_type_full(prefix, suffix, T, fmt, a, op, b) \ - do { \ - T simde_tmp_a_ = (a); \ - T simde_tmp_b_ = (b); \ - if (!(simde_tmp_a_ op simde_tmp_b_)) { \ - simde_errorf("assertion failed: %s %s %s (" prefix "%" fmt suffix " %s " prefix "%" fmt suffix ")\n", \ - #a, #op, #b, simde_tmp_a_, #op, simde_tmp_b_); \ - } \ - SIMDE_PUSH_DISABLE_MSVC_C4127_ \ - } while (0) \ - SIMDE_POP_DISABLE_MSVC_C4127_ - -# define simde_assert_double_equal(a, b, precision) \ - do { \ - const double simde_tmp_a_ = (a); \ - const double simde_tmp_b_ = (b); \ - const double simde_tmp_diff_ = ((simde_tmp_a_ - simde_tmp_b_) < 0) ? \ - -(simde_tmp_a_ - simde_tmp_b_) : \ - (simde_tmp_a_ - simde_tmp_b_); \ - if (HEDLEY_UNLIKELY(simde_tmp_diff_ > 1e-##precision)) { \ - simde_errorf("assertion failed: %s == %s (%0." #precision "g == %0." 
#precision "g)\n", \ - #a, #b, simde_tmp_a_, simde_tmp_b_); \ - } \ - SIMDE_PUSH_DISABLE_MSVC_C4127_ \ - } while (0) \ - SIMDE_POP_DISABLE_MSVC_C4127_ - -# include -# define simde_assert_string_equal(a, b) \ - do { \ - const char* simde_tmp_a_ = a; \ - const char* simde_tmp_b_ = b; \ - if (HEDLEY_UNLIKELY(strcmp(simde_tmp_a_, simde_tmp_b_) != 0)) { \ - simde_errorf("assertion failed: string %s == %s (\"%s\" == \"%s\")\n", \ - #a, #b, simde_tmp_a_, simde_tmp_b_); \ - } \ - SIMDE_PUSH_DISABLE_MSVC_C4127_ \ - } while (0) \ - SIMDE_POP_DISABLE_MSVC_C4127_ - -# define simde_assert_string_not_equal(a, b) \ - do { \ - const char* simde_tmp_a_ = a; \ - const char* simde_tmp_b_ = b; \ - if (HEDLEY_UNLIKELY(strcmp(simde_tmp_a_, simde_tmp_b_) == 0)) { \ - simde_errorf("assertion failed: string %s != %s (\"%s\" == \"%s\")\n", \ - #a, #b, simde_tmp_a_, simde_tmp_b_); \ - } \ - SIMDE_PUSH_DISABLE_MSVC_C4127_ \ - } while (0) \ - SIMDE_POP_DISABLE_MSVC_C4127_ - -# define simde_assert_memory_equal(size, a, b) \ - do { \ - const unsigned char* simde_tmp_a_ = (const unsigned char*) (a); \ - const unsigned char* simde_tmp_b_ = (const unsigned char*) (b); \ - const size_t simde_tmp_size_ = (size); \ - if (HEDLEY_UNLIKELY(memcmp(simde_tmp_a_, simde_tmp_b_, simde_tmp_size_)) != 0) { \ - size_t simde_tmp_pos_; \ - for (simde_tmp_pos_ = 0 ; simde_tmp_pos_ < simde_tmp_size_ ; simde_tmp_pos_++) { \ - if (simde_tmp_a_[simde_tmp_pos_] != simde_tmp_b_[simde_tmp_pos_]) { \ - simde_errorf("assertion failed: memory %s == %s, at offset %" SIMDE_SIZE_MODIFIER "u\n", \ - #a, #b, simde_tmp_pos_); \ - break; \ - } \ - } \ - } \ - SIMDE_PUSH_DISABLE_MSVC_C4127_ \ - } while (0) \ - SIMDE_POP_DISABLE_MSVC_C4127_ - -# define simde_assert_memory_not_equal(size, a, b) \ - do { \ - const unsigned char* simde_tmp_a_ = (const unsigned char*) (a); \ - const unsigned char* simde_tmp_b_ = (const unsigned char*) (b); \ - const size_t simde_tmp_size_ = (size); \ - if (HEDLEY_UNLIKELY(memcmp(simde_tmp_a_, simde_tmp_b_, simde_tmp_size_)) == 0) { \ - simde_errorf("assertion failed: memory %s != %s (%" SIMDE_SIZE_MODIFIER "u bytes)\n", \ - #a, #b, simde_tmp_size_); \ - } \ - SIMDE_PUSH_DISABLE_MSVC_C4127_ \ - } while (0) \ - SIMDE_POP_DISABLE_MSVC_C4127_ -#endif - -#define simde_assert_type(T, fmt, a, op, b) \ - simde_assert_type_full("", "", T, fmt, a, op, b) - -#define simde_assert_char(a, op, b) \ - simde_assert_type_full("'\\x", "'", char, "02" SIMDE_CHAR_MODIFIER "x", a, op, b) -#define simde_assert_uchar(a, op, b) \ - simde_assert_type_full("'\\x", "'", unsigned char, "02" SIMDE_CHAR_MODIFIER "x", a, op, b) -#define simde_assert_short(a, op, b) \ - simde_assert_type(short, SIMDE_SHORT_MODIFIER "d", a, op, b) -#define simde_assert_ushort(a, op, b) \ - simde_assert_type(unsigned short, SIMDE_SHORT_MODIFIER "u", a, op, b) -#define simde_assert_int(a, op, b) \ - simde_assert_type(int, "d", a, op, b) -#define simde_assert_uint(a, op, b) \ - simde_assert_type(unsigned int, "u", a, op, b) -#define simde_assert_long(a, op, b) \ - simde_assert_type(long int, "ld", a, op, b) -#define simde_assert_ulong(a, op, b) \ - simde_assert_type(unsigned long int, "lu", a, op, b) -#define simde_assert_llong(a, op, b) \ - simde_assert_type(long long int, "lld", a, op, b) -#define simde_assert_ullong(a, op, b) \ - simde_assert_type(unsigned long long int, "llu", a, op, b) - -#define simde_assert_size(a, op, b) \ - simde_assert_type(size_t, SIMDE_SIZE_MODIFIER "u", a, op, b) - -#define simde_assert_float(a, op, b) \ - simde_assert_type(float, "f", a, op, b) 
-#define simde_assert_double(a, op, b) \ - simde_assert_type(double, "g", a, op, b) -#define simde_assert_ptr(a, op, b) \ - simde_assert_type(const void*, "p", a, op, b) - -#define simde_assert_int8(a, op, b) \ - simde_assert_type(int8_t, PRIi8, a, op, b) -#define simde_assert_uint8(a, op, b) \ - simde_assert_type(uint8_t, PRIu8, a, op, b) -#define simde_assert_int16(a, op, b) \ - simde_assert_type(int16_t, PRIi16, a, op, b) -#define simde_assert_uint16(a, op, b) \ - simde_assert_type(uint16_t, PRIu16, a, op, b) -#define simde_assert_int32(a, op, b) \ - simde_assert_type(int32_t, PRIi32, a, op, b) -#define simde_assert_uint32(a, op, b) \ - simde_assert_type(uint32_t, PRIu32, a, op, b) -#define simde_assert_int64(a, op, b) \ - simde_assert_type(int64_t, PRIi64, a, op, b) -#define simde_assert_uint64(a, op, b) \ - simde_assert_type(uint64_t, PRIu64, a, op, b) - -#define simde_assert_ptr_equal(a, b) \ - simde_assert_ptr(a, ==, b) -#define simde_assert_ptr_not_equal(a, b) \ - simde_assert_ptr(a, !=, b) -#define simde_assert_null(ptr) \ - simde_assert_ptr(ptr, ==, NULL) -#define simde_assert_not_null(ptr) \ - simde_assert_ptr(ptr, !=, NULL) -#define simde_assert_ptr_null(ptr) \ - simde_assert_ptr(ptr, ==, NULL) -#define simde_assert_ptr_not_null(ptr) \ - simde_assert_ptr(ptr, !=, NULL) - -#endif /* !defined(SIMDE_CHECK_H) */ diff --git a/extern/simde/debug-trap.h b/extern/simde/debug-trap.h deleted file mode 100644 index 2d3c60f84..000000000 --- a/extern/simde/debug-trap.h +++ /dev/null @@ -1,85 +0,0 @@ -/* Debugging assertions and traps - * Portable Snippets - https://github.com/nemequ/portable-snippets - * Created by Evan Nemerson - * - * To the extent possible under law, the authors have waived all - * copyright and related or neighboring rights to this code. 
For - * details, see the Creative Commons Zero 1.0 Universal license at - * https://creativecommons.org/publicdomain/zero/1.0/ - * - * SPDX-License-Identifier: CC0-1.0 - */ - -#if !defined(SIMDE_DEBUG_TRAP_H) -#define SIMDE_DEBUG_TRAP_H - -#if !defined(SIMDE_NDEBUG) && defined(NDEBUG) && !defined(SIMDE_DEBUG) -# define SIMDE_NDEBUG 1 -#endif - -#if defined(__has_builtin) && !defined(__ibmxl__) -# if __has_builtin(__builtin_debugtrap) -# define simde_trap() __builtin_debugtrap() -# elif __has_builtin(__debugbreak) -# define simde_trap() __debugbreak() -# endif -#endif -#if !defined(simde_trap) -# if defined(_MSC_VER) || defined(__INTEL_COMPILER) -# define simde_trap() __debugbreak() -# elif defined(__ARMCC_VERSION) -# define simde_trap() __breakpoint(42) -# elif defined(__ibmxl__) || defined(__xlC__) -# include -# define simde_trap() __trap(42) -# elif defined(__DMC__) && defined(_M_IX86) - static inline void simde_trap(void) { __asm int 3h; } -# elif defined(__i386__) || defined(__x86_64__) - static inline void simde_trap(void) { __asm__ __volatile__("int $03"); } -# elif defined(__thumb__) - static inline void simde_trap(void) { __asm__ __volatile__(".inst 0xde01"); } -# elif defined(__aarch64__) - static inline void simde_trap(void) { __asm__ __volatile__(".inst 0xd4200000"); } -# elif defined(__arm__) - static inline void simde_trap(void) { __asm__ __volatile__(".inst 0xe7f001f0"); } -# elif defined (__alpha__) && !defined(__osf__) - static inline void simde_trap(void) { __asm__ __volatile__("bpt"); } -# elif defined(_54_) - static inline void simde_trap(void) { __asm__ __volatile__("ESTOP"); } -# elif defined(_55_) - static inline void simde_trap(void) { __asm__ __volatile__(";\n .if (.MNEMONIC)\n ESTOP_1\n .else\n ESTOP_1()\n .endif\n NOP"); } -# elif defined(_64P_) - static inline void simde_trap(void) { __asm__ __volatile__("SWBP 0"); } -# elif defined(_6x_) - static inline void simde_trap(void) { __asm__ __volatile__("NOP\n .word 0x10000000"); } -# elif defined(__STDC_HOSTED__) && (__STDC_HOSTED__ == 0) && defined(__GNUC__) -# define simde_trap() __builtin_trap() -# else -# include -# if defined(SIGTRAP) -# define simde_trap() raise(SIGTRAP) -# else -# define simde_trap() raise(SIGABRT) -# endif -# endif -#endif - -#if defined(HEDLEY_LIKELY) -# define SIMDE_DBG_LIKELY(expr) HEDLEY_LIKELY(expr) -#elif defined(__GNUC__) && (__GNUC__ >= 3) -# define SIMDE_DBG_LIKELY(expr) __builtin_expect(!!(expr), 1) -#else -# define SIMDE_DBG_LIKELY(expr) (!!(expr)) -#endif - -#if !defined(SIMDE_NDEBUG) || (SIMDE_NDEBUG == 0) -# define simde_dbg_assert(expr) do { \ - if (!SIMDE_DBG_LIKELY(expr)) { \ - simde_trap(); \ - } \ - } while (0) -#else -# define simde_dbg_assert(expr) -#endif - -#endif /* !defined(SIMDE_DEBUG_TRAP_H) */ diff --git a/extern/simde/hedley.h b/extern/simde/hedley.h deleted file mode 100644 index 41ac30221..000000000 --- a/extern/simde/hedley.h +++ /dev/null @@ -1,2044 +0,0 @@ -/* Hedley - https://nemequ.github.io/hedley - * Created by Evan Nemerson - * - * To the extent possible under law, the author(s) have dedicated all - * copyright and related and neighboring rights to this software to - * the public domain worldwide. This software is distributed without - * any warranty. - * - * For details, see . 
- * SPDX-License-Identifier: CC0-1.0 - */ - -#if !defined(HEDLEY_VERSION) || (HEDLEY_VERSION < 16) -#if defined(HEDLEY_VERSION) -# undef HEDLEY_VERSION -#endif -#define HEDLEY_VERSION 16 - -#if defined(HEDLEY_STRINGIFY_EX) -# undef HEDLEY_STRINGIFY_EX -#endif -#define HEDLEY_STRINGIFY_EX(x) #x - -#if defined(HEDLEY_STRINGIFY) -# undef HEDLEY_STRINGIFY -#endif -#define HEDLEY_STRINGIFY(x) HEDLEY_STRINGIFY_EX(x) - -#if defined(HEDLEY_CONCAT_EX) -# undef HEDLEY_CONCAT_EX -#endif -#define HEDLEY_CONCAT_EX(a,b) a##b - -#if defined(HEDLEY_CONCAT) -# undef HEDLEY_CONCAT -#endif -#define HEDLEY_CONCAT(a,b) HEDLEY_CONCAT_EX(a,b) - -#if defined(HEDLEY_CONCAT3_EX) -# undef HEDLEY_CONCAT3_EX -#endif -#define HEDLEY_CONCAT3_EX(a,b,c) a##b##c - -#if defined(HEDLEY_CONCAT3) -# undef HEDLEY_CONCAT3 -#endif -#define HEDLEY_CONCAT3(a,b,c) HEDLEY_CONCAT3_EX(a,b,c) - -#if defined(HEDLEY_VERSION_ENCODE) -# undef HEDLEY_VERSION_ENCODE -#endif -#define HEDLEY_VERSION_ENCODE(major,minor,revision) (((major) * 1000000) + ((minor) * 1000) + (revision)) - -#if defined(HEDLEY_VERSION_DECODE_MAJOR) -# undef HEDLEY_VERSION_DECODE_MAJOR -#endif -#define HEDLEY_VERSION_DECODE_MAJOR(version) ((version) / 1000000) - -#if defined(HEDLEY_VERSION_DECODE_MINOR) -# undef HEDLEY_VERSION_DECODE_MINOR -#endif -#define HEDLEY_VERSION_DECODE_MINOR(version) (((version) % 1000000) / 1000) - -#if defined(HEDLEY_VERSION_DECODE_REVISION) -# undef HEDLEY_VERSION_DECODE_REVISION -#endif -#define HEDLEY_VERSION_DECODE_REVISION(version) ((version) % 1000) - -#if defined(HEDLEY_GNUC_VERSION) -# undef HEDLEY_GNUC_VERSION -#endif -#if defined(__GNUC__) && defined(__GNUC_PATCHLEVEL__) -# define HEDLEY_GNUC_VERSION HEDLEY_VERSION_ENCODE(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__) -#elif defined(__GNUC__) -# define HEDLEY_GNUC_VERSION HEDLEY_VERSION_ENCODE(__GNUC__, __GNUC_MINOR__, 0) -#endif - -#if defined(HEDLEY_GNUC_VERSION_CHECK) -# undef HEDLEY_GNUC_VERSION_CHECK -#endif -#if defined(HEDLEY_GNUC_VERSION) -# define HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) (HEDLEY_GNUC_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_MSVC_VERSION) -# undef HEDLEY_MSVC_VERSION -#endif -#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 140000000) && !defined(__ICL) -# define HEDLEY_MSVC_VERSION HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 10000000, (_MSC_FULL_VER % 10000000) / 100000, (_MSC_FULL_VER % 100000) / 100) -#elif defined(_MSC_FULL_VER) && !defined(__ICL) -# define HEDLEY_MSVC_VERSION HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 1000000, (_MSC_FULL_VER % 1000000) / 10000, (_MSC_FULL_VER % 10000) / 10) -#elif defined(_MSC_VER) && !defined(__ICL) -# define HEDLEY_MSVC_VERSION HEDLEY_VERSION_ENCODE(_MSC_VER / 100, _MSC_VER % 100, 0) -#endif - -#if defined(HEDLEY_MSVC_VERSION_CHECK) -# undef HEDLEY_MSVC_VERSION_CHECK -#endif -#if !defined(HEDLEY_MSVC_VERSION) -# define HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (0) -#elif defined(_MSC_VER) && (_MSC_VER >= 1400) -# define HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_FULL_VER >= ((major * 10000000) + (minor * 100000) + (patch))) -#elif defined(_MSC_VER) && (_MSC_VER >= 1200) -# define HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_FULL_VER >= ((major * 1000000) + (minor * 10000) + (patch))) -#else -# define HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_VER >= ((major * 100) + (minor))) -#endif - -#if defined(HEDLEY_INTEL_VERSION) -# undef HEDLEY_INTEL_VERSION -#endif -#if defined(__INTEL_COMPILER) && 
defined(__INTEL_COMPILER_UPDATE) && !defined(__ICL) -# define HEDLEY_INTEL_VERSION HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, __INTEL_COMPILER_UPDATE) -#elif defined(__INTEL_COMPILER) && !defined(__ICL) -# define HEDLEY_INTEL_VERSION HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, 0) -#endif - -#if defined(HEDLEY_INTEL_VERSION_CHECK) -# undef HEDLEY_INTEL_VERSION_CHECK -#endif -#if defined(HEDLEY_INTEL_VERSION) -# define HEDLEY_INTEL_VERSION_CHECK(major,minor,patch) (HEDLEY_INTEL_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_INTEL_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_INTEL_CL_VERSION) -# undef HEDLEY_INTEL_CL_VERSION -#endif -#if defined(__INTEL_COMPILER) && defined(__INTEL_COMPILER_UPDATE) && defined(__ICL) -# define HEDLEY_INTEL_CL_VERSION HEDLEY_VERSION_ENCODE(__INTEL_COMPILER, __INTEL_COMPILER_UPDATE, 0) -#endif - -#if defined(HEDLEY_INTEL_CL_VERSION_CHECK) -# undef HEDLEY_INTEL_CL_VERSION_CHECK -#endif -#if defined(HEDLEY_INTEL_CL_VERSION) -# define HEDLEY_INTEL_CL_VERSION_CHECK(major,minor,patch) (HEDLEY_INTEL_CL_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_INTEL_CL_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_PGI_VERSION) -# undef HEDLEY_PGI_VERSION -#endif -#if defined(__PGI) && defined(__PGIC__) && defined(__PGIC_MINOR__) && defined(__PGIC_PATCHLEVEL__) -# define HEDLEY_PGI_VERSION HEDLEY_VERSION_ENCODE(__PGIC__, __PGIC_MINOR__, __PGIC_PATCHLEVEL__) -#endif - -#if defined(HEDLEY_PGI_VERSION_CHECK) -# undef HEDLEY_PGI_VERSION_CHECK -#endif -#if defined(HEDLEY_PGI_VERSION) -# define HEDLEY_PGI_VERSION_CHECK(major,minor,patch) (HEDLEY_PGI_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_PGI_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_SUNPRO_VERSION) -# undef HEDLEY_SUNPRO_VERSION -#endif -#if defined(__SUNPRO_C) && (__SUNPRO_C > 0x1000) -# define HEDLEY_SUNPRO_VERSION HEDLEY_VERSION_ENCODE((((__SUNPRO_C >> 16) & 0xf) * 10) + ((__SUNPRO_C >> 12) & 0xf), (((__SUNPRO_C >> 8) & 0xf) * 10) + ((__SUNPRO_C >> 4) & 0xf), (__SUNPRO_C & 0xf) * 10) -#elif defined(__SUNPRO_C) -# define HEDLEY_SUNPRO_VERSION HEDLEY_VERSION_ENCODE((__SUNPRO_C >> 8) & 0xf, (__SUNPRO_C >> 4) & 0xf, (__SUNPRO_C) & 0xf) -#elif defined(__SUNPRO_CC) && (__SUNPRO_CC > 0x1000) -# define HEDLEY_SUNPRO_VERSION HEDLEY_VERSION_ENCODE((((__SUNPRO_CC >> 16) & 0xf) * 10) + ((__SUNPRO_CC >> 12) & 0xf), (((__SUNPRO_CC >> 8) & 0xf) * 10) + ((__SUNPRO_CC >> 4) & 0xf), (__SUNPRO_CC & 0xf) * 10) -#elif defined(__SUNPRO_CC) -# define HEDLEY_SUNPRO_VERSION HEDLEY_VERSION_ENCODE((__SUNPRO_CC >> 8) & 0xf, (__SUNPRO_CC >> 4) & 0xf, (__SUNPRO_CC) & 0xf) -#endif - -#if defined(HEDLEY_SUNPRO_VERSION_CHECK) -# undef HEDLEY_SUNPRO_VERSION_CHECK -#endif -#if defined(HEDLEY_SUNPRO_VERSION) -# define HEDLEY_SUNPRO_VERSION_CHECK(major,minor,patch) (HEDLEY_SUNPRO_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_SUNPRO_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_EMSCRIPTEN_VERSION) -# undef HEDLEY_EMSCRIPTEN_VERSION -#endif -#if defined(__EMSCRIPTEN__) -# define HEDLEY_EMSCRIPTEN_VERSION HEDLEY_VERSION_ENCODE(__EMSCRIPTEN_major__, __EMSCRIPTEN_minor__, __EMSCRIPTEN_tiny__) -#endif - -#if defined(HEDLEY_EMSCRIPTEN_VERSION_CHECK) -# undef HEDLEY_EMSCRIPTEN_VERSION_CHECK -#endif -#if defined(HEDLEY_EMSCRIPTEN_VERSION) -# define HEDLEY_EMSCRIPTEN_VERSION_CHECK(major,minor,patch) 
(HEDLEY_EMSCRIPTEN_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_EMSCRIPTEN_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_ARM_VERSION) -# undef HEDLEY_ARM_VERSION -#endif -#if defined(__CC_ARM) && defined(__ARMCOMPILER_VERSION) -# define HEDLEY_ARM_VERSION HEDLEY_VERSION_ENCODE(__ARMCOMPILER_VERSION / 1000000, (__ARMCOMPILER_VERSION % 1000000) / 10000, (__ARMCOMPILER_VERSION % 10000) / 100) -#elif defined(__CC_ARM) && defined(__ARMCC_VERSION) -# define HEDLEY_ARM_VERSION HEDLEY_VERSION_ENCODE(__ARMCC_VERSION / 1000000, (__ARMCC_VERSION % 1000000) / 10000, (__ARMCC_VERSION % 10000) / 100) -#endif - -#if defined(HEDLEY_ARM_VERSION_CHECK) -# undef HEDLEY_ARM_VERSION_CHECK -#endif -#if defined(HEDLEY_ARM_VERSION) -# define HEDLEY_ARM_VERSION_CHECK(major,minor,patch) (HEDLEY_ARM_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_ARM_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_IBM_VERSION) -# undef HEDLEY_IBM_VERSION -#endif -#if defined(__ibmxl__) -# define HEDLEY_IBM_VERSION HEDLEY_VERSION_ENCODE(__ibmxl_version__, __ibmxl_release__, __ibmxl_modification__) -#elif defined(__xlC__) && defined(__xlC_ver__) -# define HEDLEY_IBM_VERSION HEDLEY_VERSION_ENCODE(__xlC__ >> 8, __xlC__ & 0xff, (__xlC_ver__ >> 8) & 0xff) -#elif defined(__xlC__) -# define HEDLEY_IBM_VERSION HEDLEY_VERSION_ENCODE(__xlC__ >> 8, __xlC__ & 0xff, 0) -#endif - -#if defined(HEDLEY_IBM_VERSION_CHECK) -# undef HEDLEY_IBM_VERSION_CHECK -#endif -#if defined(HEDLEY_IBM_VERSION) -# define HEDLEY_IBM_VERSION_CHECK(major,minor,patch) (HEDLEY_IBM_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_IBM_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_TI_VERSION) -# undef HEDLEY_TI_VERSION -#endif -#if \ - defined(__TI_COMPILER_VERSION__) && \ - ( \ - defined(__TMS470__) || defined(__TI_ARM__) || \ - defined(__MSP430__) || \ - defined(__TMS320C2000__) \ - ) -# if (__TI_COMPILER_VERSION__ >= 16000000) -# define HEDLEY_TI_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) -# endif -#endif - -#if defined(HEDLEY_TI_VERSION_CHECK) -# undef HEDLEY_TI_VERSION_CHECK -#endif -#if defined(HEDLEY_TI_VERSION) -# define HEDLEY_TI_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_TI_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_TI_CL2000_VERSION) -# undef HEDLEY_TI_CL2000_VERSION -#endif -#if defined(__TI_COMPILER_VERSION__) && defined(__TMS320C2000__) -# define HEDLEY_TI_CL2000_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) -#endif - -#if defined(HEDLEY_TI_CL2000_VERSION_CHECK) -# undef HEDLEY_TI_CL2000_VERSION_CHECK -#endif -#if defined(HEDLEY_TI_CL2000_VERSION) -# define HEDLEY_TI_CL2000_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_CL2000_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_TI_CL2000_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_TI_CL430_VERSION) -# undef HEDLEY_TI_CL430_VERSION -#endif -#if defined(__TI_COMPILER_VERSION__) && defined(__MSP430__) -# define HEDLEY_TI_CL430_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) -#endif - -#if defined(HEDLEY_TI_CL430_VERSION_CHECK) 
-# undef HEDLEY_TI_CL430_VERSION_CHECK -#endif -#if defined(HEDLEY_TI_CL430_VERSION) -# define HEDLEY_TI_CL430_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_CL430_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_TI_CL430_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_TI_ARMCL_VERSION) -# undef HEDLEY_TI_ARMCL_VERSION -#endif -#if defined(__TI_COMPILER_VERSION__) && (defined(__TMS470__) || defined(__TI_ARM__)) -# define HEDLEY_TI_ARMCL_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) -#endif - -#if defined(HEDLEY_TI_ARMCL_VERSION_CHECK) -# undef HEDLEY_TI_ARMCL_VERSION_CHECK -#endif -#if defined(HEDLEY_TI_ARMCL_VERSION) -# define HEDLEY_TI_ARMCL_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_ARMCL_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_TI_ARMCL_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_TI_CL6X_VERSION) -# undef HEDLEY_TI_CL6X_VERSION -#endif -#if defined(__TI_COMPILER_VERSION__) && defined(__TMS320C6X__) -# define HEDLEY_TI_CL6X_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) -#endif - -#if defined(HEDLEY_TI_CL6X_VERSION_CHECK) -# undef HEDLEY_TI_CL6X_VERSION_CHECK -#endif -#if defined(HEDLEY_TI_CL6X_VERSION) -# define HEDLEY_TI_CL6X_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_CL6X_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_TI_CL6X_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_TI_CL7X_VERSION) -# undef HEDLEY_TI_CL7X_VERSION -#endif -#if defined(__TI_COMPILER_VERSION__) && defined(__C7000__) -# define HEDLEY_TI_CL7X_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) -#endif - -#if defined(HEDLEY_TI_CL7X_VERSION_CHECK) -# undef HEDLEY_TI_CL7X_VERSION_CHECK -#endif -#if defined(HEDLEY_TI_CL7X_VERSION) -# define HEDLEY_TI_CL7X_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_CL7X_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_TI_CL7X_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_TI_CLPRU_VERSION) -# undef HEDLEY_TI_CLPRU_VERSION -#endif -#if defined(__TI_COMPILER_VERSION__) && defined(__PRU__) -# define HEDLEY_TI_CLPRU_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) -#endif - -#if defined(HEDLEY_TI_CLPRU_VERSION_CHECK) -# undef HEDLEY_TI_CLPRU_VERSION_CHECK -#endif -#if defined(HEDLEY_TI_CLPRU_VERSION) -# define HEDLEY_TI_CLPRU_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_CLPRU_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_TI_CLPRU_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_CRAY_VERSION) -# undef HEDLEY_CRAY_VERSION -#endif -#if defined(_CRAYC) -# if defined(_RELEASE_PATCHLEVEL) -# define HEDLEY_CRAY_VERSION HEDLEY_VERSION_ENCODE(_RELEASE_MAJOR, _RELEASE_MINOR, _RELEASE_PATCHLEVEL) -# else -# define HEDLEY_CRAY_VERSION HEDLEY_VERSION_ENCODE(_RELEASE_MAJOR, _RELEASE_MINOR, 0) -# endif -#endif - -#if defined(HEDLEY_CRAY_VERSION_CHECK) -# undef HEDLEY_CRAY_VERSION_CHECK -#endif -#if defined(HEDLEY_CRAY_VERSION) -# define HEDLEY_CRAY_VERSION_CHECK(major,minor,patch) (HEDLEY_CRAY_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define 
HEDLEY_CRAY_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_IAR_VERSION) -# undef HEDLEY_IAR_VERSION -#endif -#if defined(__IAR_SYSTEMS_ICC__) -# if __VER__ > 1000 -# define HEDLEY_IAR_VERSION HEDLEY_VERSION_ENCODE((__VER__ / 1000000), ((__VER__ / 1000) % 1000), (__VER__ % 1000)) -# else -# define HEDLEY_IAR_VERSION HEDLEY_VERSION_ENCODE(__VER__ / 100, __VER__ % 100, 0) -# endif -#endif - -#if defined(HEDLEY_IAR_VERSION_CHECK) -# undef HEDLEY_IAR_VERSION_CHECK -#endif -#if defined(HEDLEY_IAR_VERSION) -# define HEDLEY_IAR_VERSION_CHECK(major,minor,patch) (HEDLEY_IAR_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_IAR_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_TINYC_VERSION) -# undef HEDLEY_TINYC_VERSION -#endif -#if defined(__TINYC__) -# define HEDLEY_TINYC_VERSION HEDLEY_VERSION_ENCODE(__TINYC__ / 1000, (__TINYC__ / 100) % 10, __TINYC__ % 100) -#endif - -#if defined(HEDLEY_TINYC_VERSION_CHECK) -# undef HEDLEY_TINYC_VERSION_CHECK -#endif -#if defined(HEDLEY_TINYC_VERSION) -# define HEDLEY_TINYC_VERSION_CHECK(major,minor,patch) (HEDLEY_TINYC_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_TINYC_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_DMC_VERSION) -# undef HEDLEY_DMC_VERSION -#endif -#if defined(__DMC__) -# define HEDLEY_DMC_VERSION HEDLEY_VERSION_ENCODE(__DMC__ >> 8, (__DMC__ >> 4) & 0xf, __DMC__ & 0xf) -#endif - -#if defined(HEDLEY_DMC_VERSION_CHECK) -# undef HEDLEY_DMC_VERSION_CHECK -#endif -#if defined(HEDLEY_DMC_VERSION) -# define HEDLEY_DMC_VERSION_CHECK(major,minor,patch) (HEDLEY_DMC_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_DMC_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_COMPCERT_VERSION) -# undef HEDLEY_COMPCERT_VERSION -#endif -#if defined(__COMPCERT_VERSION__) -# define HEDLEY_COMPCERT_VERSION HEDLEY_VERSION_ENCODE(__COMPCERT_VERSION__ / 10000, (__COMPCERT_VERSION__ / 100) % 100, __COMPCERT_VERSION__ % 100) -#endif - -#if defined(HEDLEY_COMPCERT_VERSION_CHECK) -# undef HEDLEY_COMPCERT_VERSION_CHECK -#endif -#if defined(HEDLEY_COMPCERT_VERSION) -# define HEDLEY_COMPCERT_VERSION_CHECK(major,minor,patch) (HEDLEY_COMPCERT_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_COMPCERT_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_PELLES_VERSION) -# undef HEDLEY_PELLES_VERSION -#endif -#if defined(__POCC__) -# define HEDLEY_PELLES_VERSION HEDLEY_VERSION_ENCODE(__POCC__ / 100, __POCC__ % 100, 0) -#endif - -#if defined(HEDLEY_PELLES_VERSION_CHECK) -# undef HEDLEY_PELLES_VERSION_CHECK -#endif -#if defined(HEDLEY_PELLES_VERSION) -# define HEDLEY_PELLES_VERSION_CHECK(major,minor,patch) (HEDLEY_PELLES_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_PELLES_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_MCST_LCC_VERSION) -# undef HEDLEY_MCST_LCC_VERSION -#endif -#if defined(__LCC__) && defined(__LCC_MINOR__) -# define HEDLEY_MCST_LCC_VERSION HEDLEY_VERSION_ENCODE(__LCC__ / 100, __LCC__ % 100, __LCC_MINOR__) -#endif - -#if defined(HEDLEY_MCST_LCC_VERSION_CHECK) -# undef HEDLEY_MCST_LCC_VERSION_CHECK -#endif -#if defined(HEDLEY_MCST_LCC_VERSION) -# define HEDLEY_MCST_LCC_VERSION_CHECK(major,minor,patch) (HEDLEY_MCST_LCC_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_MCST_LCC_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_GCC_VERSION) -# undef HEDLEY_GCC_VERSION 
-#endif -#if \ - defined(HEDLEY_GNUC_VERSION) && \ - !defined(__clang__) && \ - !defined(HEDLEY_INTEL_VERSION) && \ - !defined(HEDLEY_PGI_VERSION) && \ - !defined(HEDLEY_ARM_VERSION) && \ - !defined(HEDLEY_CRAY_VERSION) && \ - !defined(HEDLEY_TI_VERSION) && \ - !defined(HEDLEY_TI_ARMCL_VERSION) && \ - !defined(HEDLEY_TI_CL430_VERSION) && \ - !defined(HEDLEY_TI_CL2000_VERSION) && \ - !defined(HEDLEY_TI_CL6X_VERSION) && \ - !defined(HEDLEY_TI_CL7X_VERSION) && \ - !defined(HEDLEY_TI_CLPRU_VERSION) && \ - !defined(__COMPCERT__) && \ - !defined(HEDLEY_MCST_LCC_VERSION) -# define HEDLEY_GCC_VERSION HEDLEY_GNUC_VERSION -#endif - -#if defined(HEDLEY_GCC_VERSION_CHECK) -# undef HEDLEY_GCC_VERSION_CHECK -#endif -#if defined(HEDLEY_GCC_VERSION) -# define HEDLEY_GCC_VERSION_CHECK(major,minor,patch) (HEDLEY_GCC_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_GCC_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_HAS_ATTRIBUTE) -# undef HEDLEY_HAS_ATTRIBUTE -#endif -#if \ - defined(__has_attribute) && \ - ( \ - (!defined(HEDLEY_IAR_VERSION) || HEDLEY_IAR_VERSION_CHECK(8,5,9)) \ - ) -# define HEDLEY_HAS_ATTRIBUTE(attribute) __has_attribute(attribute) -#else -# define HEDLEY_HAS_ATTRIBUTE(attribute) (0) -#endif - -#if defined(HEDLEY_GNUC_HAS_ATTRIBUTE) -# undef HEDLEY_GNUC_HAS_ATTRIBUTE -#endif -#if defined(__has_attribute) -# define HEDLEY_GNUC_HAS_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_HAS_ATTRIBUTE(attribute) -#else -# define HEDLEY_GNUC_HAS_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(HEDLEY_GCC_HAS_ATTRIBUTE) -# undef HEDLEY_GCC_HAS_ATTRIBUTE -#endif -#if defined(__has_attribute) -# define HEDLEY_GCC_HAS_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_HAS_ATTRIBUTE(attribute) -#else -# define HEDLEY_GCC_HAS_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_GCC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(HEDLEY_HAS_CPP_ATTRIBUTE) -# undef HEDLEY_HAS_CPP_ATTRIBUTE -#endif -#if \ - defined(__has_cpp_attribute) && \ - defined(__cplusplus) && \ - (!defined(HEDLEY_SUNPRO_VERSION) || HEDLEY_SUNPRO_VERSION_CHECK(5,15,0)) -# define HEDLEY_HAS_CPP_ATTRIBUTE(attribute) __has_cpp_attribute(attribute) -#else -# define HEDLEY_HAS_CPP_ATTRIBUTE(attribute) (0) -#endif - -#if defined(HEDLEY_HAS_CPP_ATTRIBUTE_NS) -# undef HEDLEY_HAS_CPP_ATTRIBUTE_NS -#endif -#if !defined(__cplusplus) || !defined(__has_cpp_attribute) -# define HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) (0) -#elif \ - !defined(HEDLEY_PGI_VERSION) && \ - !defined(HEDLEY_IAR_VERSION) && \ - (!defined(HEDLEY_SUNPRO_VERSION) || HEDLEY_SUNPRO_VERSION_CHECK(5,15,0)) && \ - (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) -# define HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) HEDLEY_HAS_CPP_ATTRIBUTE(ns::attribute) -#else -# define HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) (0) -#endif - -#if defined(HEDLEY_GNUC_HAS_CPP_ATTRIBUTE) -# undef HEDLEY_GNUC_HAS_CPP_ATTRIBUTE -#endif -#if defined(__has_cpp_attribute) && defined(__cplusplus) -# define HEDLEY_GNUC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) __has_cpp_attribute(attribute) -#else -# define HEDLEY_GNUC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(HEDLEY_GCC_HAS_CPP_ATTRIBUTE) -# undef HEDLEY_GCC_HAS_CPP_ATTRIBUTE -#endif -#if defined(__has_cpp_attribute) && defined(__cplusplus) -# define HEDLEY_GCC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) __has_cpp_attribute(attribute) -#else -# define 
HEDLEY_GCC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_GCC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(HEDLEY_HAS_BUILTIN) -# undef HEDLEY_HAS_BUILTIN -#endif -#if defined(__has_builtin) -# define HEDLEY_HAS_BUILTIN(builtin) __has_builtin(builtin) -#else -# define HEDLEY_HAS_BUILTIN(builtin) (0) -#endif - -#if defined(HEDLEY_GNUC_HAS_BUILTIN) -# undef HEDLEY_GNUC_HAS_BUILTIN -#endif -#if defined(__has_builtin) -# define HEDLEY_GNUC_HAS_BUILTIN(builtin,major,minor,patch) __has_builtin(builtin) -#else -# define HEDLEY_GNUC_HAS_BUILTIN(builtin,major,minor,patch) HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(HEDLEY_GCC_HAS_BUILTIN) -# undef HEDLEY_GCC_HAS_BUILTIN -#endif -#if defined(__has_builtin) -# define HEDLEY_GCC_HAS_BUILTIN(builtin,major,minor,patch) __has_builtin(builtin) -#else -# define HEDLEY_GCC_HAS_BUILTIN(builtin,major,minor,patch) HEDLEY_GCC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(HEDLEY_HAS_FEATURE) -# undef HEDLEY_HAS_FEATURE -#endif -#if defined(__has_feature) -# define HEDLEY_HAS_FEATURE(feature) __has_feature(feature) -#else -# define HEDLEY_HAS_FEATURE(feature) (0) -#endif - -#if defined(HEDLEY_GNUC_HAS_FEATURE) -# undef HEDLEY_GNUC_HAS_FEATURE -#endif -#if defined(__has_feature) -# define HEDLEY_GNUC_HAS_FEATURE(feature,major,minor,patch) __has_feature(feature) -#else -# define HEDLEY_GNUC_HAS_FEATURE(feature,major,minor,patch) HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(HEDLEY_GCC_HAS_FEATURE) -# undef HEDLEY_GCC_HAS_FEATURE -#endif -#if defined(__has_feature) -# define HEDLEY_GCC_HAS_FEATURE(feature,major,minor,patch) __has_feature(feature) -#else -# define HEDLEY_GCC_HAS_FEATURE(feature,major,minor,patch) HEDLEY_GCC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(HEDLEY_HAS_EXTENSION) -# undef HEDLEY_HAS_EXTENSION -#endif -#if defined(__has_extension) -# define HEDLEY_HAS_EXTENSION(extension) __has_extension(extension) -#else -# define HEDLEY_HAS_EXTENSION(extension) (0) -#endif - -#if defined(HEDLEY_GNUC_HAS_EXTENSION) -# undef HEDLEY_GNUC_HAS_EXTENSION -#endif -#if defined(__has_extension) -# define HEDLEY_GNUC_HAS_EXTENSION(extension,major,minor,patch) __has_extension(extension) -#else -# define HEDLEY_GNUC_HAS_EXTENSION(extension,major,minor,patch) HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(HEDLEY_GCC_HAS_EXTENSION) -# undef HEDLEY_GCC_HAS_EXTENSION -#endif -#if defined(__has_extension) -# define HEDLEY_GCC_HAS_EXTENSION(extension,major,minor,patch) __has_extension(extension) -#else -# define HEDLEY_GCC_HAS_EXTENSION(extension,major,minor,patch) HEDLEY_GCC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(HEDLEY_HAS_DECLSPEC_ATTRIBUTE) -# undef HEDLEY_HAS_DECLSPEC_ATTRIBUTE -#endif -#if defined(__has_declspec_attribute) -# define HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) __has_declspec_attribute(attribute) -#else -# define HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) (0) -#endif - -#if defined(HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE) -# undef HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE -#endif -#if defined(__has_declspec_attribute) -# define HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) __has_declspec_attribute(attribute) -#else -# define HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE) -# undef HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE -#endif -#if defined(__has_declspec_attribute) -# define 
HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) __has_declspec_attribute(attribute) -#else -# define HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_GCC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(HEDLEY_HAS_WARNING) -# undef HEDLEY_HAS_WARNING -#endif -#if defined(__has_warning) -# define HEDLEY_HAS_WARNING(warning) __has_warning(warning) -#else -# define HEDLEY_HAS_WARNING(warning) (0) -#endif - -#if defined(HEDLEY_GNUC_HAS_WARNING) -# undef HEDLEY_GNUC_HAS_WARNING -#endif -#if defined(__has_warning) -# define HEDLEY_GNUC_HAS_WARNING(warning,major,minor,patch) __has_warning(warning) -#else -# define HEDLEY_GNUC_HAS_WARNING(warning,major,minor,patch) HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(HEDLEY_GCC_HAS_WARNING) -# undef HEDLEY_GCC_HAS_WARNING -#endif -#if defined(__has_warning) -# define HEDLEY_GCC_HAS_WARNING(warning,major,minor,patch) __has_warning(warning) -#else -# define HEDLEY_GCC_HAS_WARNING(warning,major,minor,patch) HEDLEY_GCC_VERSION_CHECK(major,minor,patch) -#endif - -#if \ - (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \ - defined(__clang__) || \ - HEDLEY_GCC_VERSION_CHECK(3,0,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_IAR_VERSION_CHECK(8,0,0) || \ - HEDLEY_PGI_VERSION_CHECK(18,4,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - HEDLEY_TI_ARMCL_VERSION_CHECK(4,7,0) || \ - HEDLEY_TI_CL430_VERSION_CHECK(2,0,1) || \ - HEDLEY_TI_CL2000_VERSION_CHECK(6,1,0) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(7,0,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ - HEDLEY_CRAY_VERSION_CHECK(5,0,0) || \ - HEDLEY_TINYC_VERSION_CHECK(0,9,17) || \ - HEDLEY_SUNPRO_VERSION_CHECK(8,0,0) || \ - (HEDLEY_IBM_VERSION_CHECK(10,1,0) && defined(__C99_PRAGMA_OPERATOR)) -# define HEDLEY_PRAGMA(value) _Pragma(#value) -#elif HEDLEY_MSVC_VERSION_CHECK(15,0,0) -# define HEDLEY_PRAGMA(value) __pragma(value) -#else -# define HEDLEY_PRAGMA(value) -#endif - -#if defined(HEDLEY_DIAGNOSTIC_PUSH) -# undef HEDLEY_DIAGNOSTIC_PUSH -#endif -#if defined(HEDLEY_DIAGNOSTIC_POP) -# undef HEDLEY_DIAGNOSTIC_POP -#endif -#if defined(__clang__) -# define HEDLEY_DIAGNOSTIC_PUSH _Pragma("clang diagnostic push") -# define HEDLEY_DIAGNOSTIC_POP _Pragma("clang diagnostic pop") -#elif HEDLEY_INTEL_VERSION_CHECK(13,0,0) -# define HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)") -# define HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)") -#elif HEDLEY_GCC_VERSION_CHECK(4,6,0) -# define HEDLEY_DIAGNOSTIC_PUSH _Pragma("GCC diagnostic push") -# define HEDLEY_DIAGNOSTIC_POP _Pragma("GCC diagnostic pop") -#elif \ - HEDLEY_MSVC_VERSION_CHECK(15,0,0) || \ - HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) -# define HEDLEY_DIAGNOSTIC_PUSH __pragma(warning(push)) -# define HEDLEY_DIAGNOSTIC_POP __pragma(warning(pop)) -#elif HEDLEY_ARM_VERSION_CHECK(5,6,0) -# define HEDLEY_DIAGNOSTIC_PUSH _Pragma("push") -# define HEDLEY_DIAGNOSTIC_POP _Pragma("pop") -#elif \ - HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ - HEDLEY_TI_CL430_VERSION_CHECK(4,4,0) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(8,1,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) -# define HEDLEY_DIAGNOSTIC_PUSH _Pragma("diag_push") -# define HEDLEY_DIAGNOSTIC_POP _Pragma("diag_pop") -#elif HEDLEY_PELLES_VERSION_CHECK(2,90,0) -# define HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)") -# define HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)") -#else -# 
define HEDLEY_DIAGNOSTIC_PUSH -# define HEDLEY_DIAGNOSTIC_POP -#endif - -/* HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_ is for - HEDLEY INTERNAL USE ONLY. API subject to change without notice. */ -#if defined(HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_) -# undef HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_ -#endif -#if defined(__cplusplus) -# if HEDLEY_HAS_WARNING("-Wc++98-compat") -# if HEDLEY_HAS_WARNING("-Wc++17-extensions") -# if HEDLEY_HAS_WARNING("-Wc++1z-extensions") -# define HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \ - HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \ - _Pragma("clang diagnostic ignored \"-Wc++17-extensions\"") \ - _Pragma("clang diagnostic ignored \"-Wc++1z-extensions\"") \ - xpr \ - HEDLEY_DIAGNOSTIC_POP -# else -# define HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \ - HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \ - _Pragma("clang diagnostic ignored \"-Wc++17-extensions\"") \ - xpr \ - HEDLEY_DIAGNOSTIC_POP -# endif -# else -# define HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \ - HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \ - xpr \ - HEDLEY_DIAGNOSTIC_POP -# endif -# endif -#endif -#if !defined(HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_) -# define HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(x) x -#endif - -#if defined(HEDLEY_CONST_CAST) -# undef HEDLEY_CONST_CAST -#endif -#if defined(__cplusplus) -# define HEDLEY_CONST_CAST(T, expr) (const_cast<T>(expr)) -#elif \ - HEDLEY_HAS_WARNING("-Wcast-qual") || \ - HEDLEY_GCC_VERSION_CHECK(4,6,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) -# define HEDLEY_CONST_CAST(T, expr) (__extension__ ({ \ - HEDLEY_DIAGNOSTIC_PUSH \ - HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL \ - ((T) (expr)); \ - HEDLEY_DIAGNOSTIC_POP \ - })) -#else -# define HEDLEY_CONST_CAST(T, expr) ((T) (expr)) -#endif - -#if defined(HEDLEY_REINTERPRET_CAST) -# undef HEDLEY_REINTERPRET_CAST -#endif -#if defined(__cplusplus) -# define HEDLEY_REINTERPRET_CAST(T, expr) (reinterpret_cast<T>(expr)) -#else -# define HEDLEY_REINTERPRET_CAST(T, expr) ((T) (expr)) -#endif - -#if defined(HEDLEY_STATIC_CAST) -# undef HEDLEY_STATIC_CAST -#endif -#if defined(__cplusplus) -# define HEDLEY_STATIC_CAST(T, expr) (static_cast<T>(expr)) -#else -# define HEDLEY_STATIC_CAST(T, expr) ((T) (expr)) -#endif - -#if defined(HEDLEY_CPP_CAST) -# undef HEDLEY_CPP_CAST -#endif -#if defined(__cplusplus) -# if HEDLEY_HAS_WARNING("-Wold-style-cast") -# define HEDLEY_CPP_CAST(T, expr) \ - HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("clang diagnostic ignored \"-Wold-style-cast\"") \ - ((T) (expr)) \ - HEDLEY_DIAGNOSTIC_POP -# elif HEDLEY_IAR_VERSION_CHECK(8,3,0) -# define HEDLEY_CPP_CAST(T, expr) \ - HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("diag_suppress=Pe137") \ - HEDLEY_DIAGNOSTIC_POP -# else -# define HEDLEY_CPP_CAST(T, expr) ((T) (expr)) -# endif -#else -# define HEDLEY_CPP_CAST(T, expr) (expr) -#endif - -#if defined(HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED) -# undef HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED -#endif -#if HEDLEY_HAS_WARNING("-Wdeprecated-declarations") -# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") -#elif HEDLEY_INTEL_VERSION_CHECK(13,0,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("warning(disable:1478 1786)") -#elif HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED __pragma(warning(disable:1478 1786)) -#elif HEDLEY_PGI_VERSION_CHECK(20,7,0) -# define
HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1216,1444,1445") -#elif HEDLEY_PGI_VERSION_CHECK(17,10,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1444") -#elif HEDLEY_GCC_VERSION_CHECK(4,3,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") -#elif HEDLEY_MSVC_VERSION_CHECK(15,0,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED __pragma(warning(disable:4996)) -#elif HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1444") -#elif \ - HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ - (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ - (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ - (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1291,1718") -#elif HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) && !defined(__cplusplus) -# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("error_messages(off,E_DEPRECATED_ATT,E_DEPRECATED_ATT_MESS)") -#elif HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) && defined(__cplusplus) -# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("error_messages(off,symdeprecated,symdeprecated2)") -#elif HEDLEY_IAR_VERSION_CHECK(8,0,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress=Pe1444,Pe1215") -#elif HEDLEY_PELLES_VERSION_CHECK(2,90,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("warn(disable:2241)") -#else -# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED -#endif - -#if defined(HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS) -# undef HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS -#endif -#if HEDLEY_HAS_WARNING("-Wunknown-pragmas") -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("clang diagnostic ignored \"-Wunknown-pragmas\"") -#elif HEDLEY_INTEL_VERSION_CHECK(13,0,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("warning(disable:161)") -#elif HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS __pragma(warning(disable:161)) -#elif HEDLEY_PGI_VERSION_CHECK(17,10,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 1675") -#elif HEDLEY_GCC_VERSION_CHECK(4,3,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("GCC diagnostic ignored \"-Wunknown-pragmas\"") -#elif HEDLEY_MSVC_VERSION_CHECK(15,0,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS __pragma(warning(disable:4068)) -#elif \ - HEDLEY_TI_VERSION_CHECK(16,9,0) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 163") -#elif HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 163") -#elif HEDLEY_IAR_VERSION_CHECK(8,0,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress=Pe161") -#elif HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 
161") -#else -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS -#endif - -#if defined(HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES) -# undef HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES -#endif -#if HEDLEY_HAS_WARNING("-Wunknown-attributes") -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("clang diagnostic ignored \"-Wunknown-attributes\"") -#elif HEDLEY_GCC_VERSION_CHECK(4,6,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") -#elif HEDLEY_INTEL_VERSION_CHECK(17,0,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("warning(disable:1292)") -#elif HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES __pragma(warning(disable:1292)) -#elif HEDLEY_MSVC_VERSION_CHECK(19,0,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES __pragma(warning(disable:5030)) -#elif HEDLEY_PGI_VERSION_CHECK(20,7,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097,1098") -#elif HEDLEY_PGI_VERSION_CHECK(17,10,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097") -#elif HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("error_messages(off,attrskipunsup)") -#elif \ - HEDLEY_TI_VERSION_CHECK(18,1,0) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(8,3,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1173") -#elif HEDLEY_IAR_VERSION_CHECK(8,0,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress=Pe1097") -#elif HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097") -#else -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES -#endif - -#if defined(HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL) -# undef HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL -#endif -#if HEDLEY_HAS_WARNING("-Wcast-qual") -# define HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("clang diagnostic ignored \"-Wcast-qual\"") -#elif HEDLEY_INTEL_VERSION_CHECK(13,0,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("warning(disable:2203 2331)") -#elif HEDLEY_GCC_VERSION_CHECK(3,0,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("GCC diagnostic ignored \"-Wcast-qual\"") -#else -# define HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL -#endif - -#if defined(HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION) -# undef HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION -#endif -#if HEDLEY_HAS_WARNING("-Wunused-function") -# define HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION _Pragma("clang diagnostic ignored \"-Wunused-function\"") -#elif HEDLEY_GCC_VERSION_CHECK(3,4,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION _Pragma("GCC diagnostic ignored \"-Wunused-function\"") -#elif HEDLEY_MSVC_VERSION_CHECK(1,0,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION __pragma(warning(disable:4505)) -#elif HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION _Pragma("diag_suppress 3142") -#else -# define HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION -#endif - -#if defined(HEDLEY_DEPRECATED) -# undef HEDLEY_DEPRECATED -#endif -#if defined(HEDLEY_DEPRECATED_FOR) -# undef HEDLEY_DEPRECATED_FOR -#endif -#if \ - HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \ - HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) -# define HEDLEY_DEPRECATED(since) __declspec(deprecated("Since " # since)) -# define 
HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated("Since " #since "; use " #replacement)) -#elif \ - (HEDLEY_HAS_EXTENSION(attribute_deprecated_with_message) && !defined(HEDLEY_IAR_VERSION)) || \ - HEDLEY_GCC_VERSION_CHECK(4,5,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_ARM_VERSION_CHECK(5,6,0) || \ - HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) || \ - HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ - HEDLEY_TI_VERSION_CHECK(18,1,0) || \ - HEDLEY_TI_ARMCL_VERSION_CHECK(18,1,0) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(8,3,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_DEPRECATED(since) __attribute__((__deprecated__("Since " #since))) -# define HEDLEY_DEPRECATED_FOR(since, replacement) __attribute__((__deprecated__("Since " #since "; use " #replacement))) -#elif defined(__cplusplus) && (__cplusplus >= 201402L) -# define HEDLEY_DEPRECATED(since) HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[deprecated("Since " #since)]]) -# define HEDLEY_DEPRECATED_FOR(since, replacement) HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[deprecated("Since " #since "; use " #replacement)]]) -#elif \ - HEDLEY_HAS_ATTRIBUTE(deprecated) || \ - HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ - (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ - (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ - (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \ - HEDLEY_IAR_VERSION_CHECK(8,10,0) -# define HEDLEY_DEPRECATED(since) __attribute__((__deprecated__)) -# define HEDLEY_DEPRECATED_FOR(since, replacement) __attribute__((__deprecated__)) -#elif \ - HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ - HEDLEY_PELLES_VERSION_CHECK(6,50,0) || \ - HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) -# define HEDLEY_DEPRECATED(since) __declspec(deprecated) -# define HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated) -#elif HEDLEY_IAR_VERSION_CHECK(8,0,0) -# define HEDLEY_DEPRECATED(since) _Pragma("deprecated") -# define HEDLEY_DEPRECATED_FOR(since, replacement) _Pragma("deprecated") -#else -# define HEDLEY_DEPRECATED(since) -# define HEDLEY_DEPRECATED_FOR(since, replacement) -#endif - -#if defined(HEDLEY_UNAVAILABLE) -# undef HEDLEY_UNAVAILABLE -#endif -#if \ - HEDLEY_HAS_ATTRIBUTE(warning) || \ - HEDLEY_GCC_VERSION_CHECK(4,3,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_UNAVAILABLE(available_since) __attribute__((__warning__("Not available until " #available_since))) -#else -# define HEDLEY_UNAVAILABLE(available_since) -#endif - -#if defined(HEDLEY_WARN_UNUSED_RESULT) -# undef HEDLEY_WARN_UNUSED_RESULT -#endif -#if defined(HEDLEY_WARN_UNUSED_RESULT_MSG) -# undef HEDLEY_WARN_UNUSED_RESULT_MSG -#endif -#if \ - HEDLEY_HAS_ATTRIBUTE(warn_unused_result) || \ - HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) 
&& defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ - (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ - (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ - (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ - (HEDLEY_SUNPRO_VERSION_CHECK(5,15,0) && defined(__cplusplus)) || \ - HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__)) -# define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) __attribute__((__warn_unused_result__)) -#elif (HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard) >= 201907L) -# define HEDLEY_WARN_UNUSED_RESULT HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]]) -# define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard(msg)]]) -#elif HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard) -# define HEDLEY_WARN_UNUSED_RESULT HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]]) -# define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]]) -#elif defined(_Check_return_) /* SAL */ -# define HEDLEY_WARN_UNUSED_RESULT _Check_return_ -# define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) _Check_return_ -#else -# define HEDLEY_WARN_UNUSED_RESULT -# define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) -#endif - -#if defined(HEDLEY_SENTINEL) -# undef HEDLEY_SENTINEL -#endif -#if \ - HEDLEY_HAS_ATTRIBUTE(sentinel) || \ - HEDLEY_GCC_VERSION_CHECK(4,0,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_ARM_VERSION_CHECK(5,4,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_SENTINEL(position) __attribute__((__sentinel__(position))) -#else -# define HEDLEY_SENTINEL(position) -#endif - -#if defined(HEDLEY_NO_RETURN) -# undef HEDLEY_NO_RETURN -#endif -#if HEDLEY_IAR_VERSION_CHECK(8,0,0) -# define HEDLEY_NO_RETURN __noreturn -#elif \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_NO_RETURN __attribute__((__noreturn__)) -#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L -# define HEDLEY_NO_RETURN _Noreturn -#elif defined(__cplusplus) && (__cplusplus >= 201103L) -# define HEDLEY_NO_RETURN HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[noreturn]]) -#elif \ - HEDLEY_HAS_ATTRIBUTE(noreturn) || \ - HEDLEY_GCC_VERSION_CHECK(3,2,0) || \ - HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ - (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ - (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ - (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ - HEDLEY_IAR_VERSION_CHECK(8,10,0) -# define HEDLEY_NO_RETURN __attribute__((__noreturn__)) -#elif 
HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) -# define HEDLEY_NO_RETURN _Pragma("does_not_return") -#elif \ - HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ - HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) -# define HEDLEY_NO_RETURN __declspec(noreturn) -#elif HEDLEY_TI_CL6X_VERSION_CHECK(6,0,0) && defined(__cplusplus) -# define HEDLEY_NO_RETURN _Pragma("FUNC_NEVER_RETURNS;") -#elif HEDLEY_COMPCERT_VERSION_CHECK(3,2,0) -# define HEDLEY_NO_RETURN __attribute((noreturn)) -#elif HEDLEY_PELLES_VERSION_CHECK(9,0,0) -# define HEDLEY_NO_RETURN __declspec(noreturn) -#else -# define HEDLEY_NO_RETURN -#endif - -#if defined(HEDLEY_NO_ESCAPE) -# undef HEDLEY_NO_ESCAPE -#endif -#if HEDLEY_HAS_ATTRIBUTE(noescape) -# define HEDLEY_NO_ESCAPE __attribute__((__noescape__)) -#else -# define HEDLEY_NO_ESCAPE -#endif - -#if defined(HEDLEY_UNREACHABLE) -# undef HEDLEY_UNREACHABLE -#endif -#if defined(HEDLEY_UNREACHABLE_RETURN) -# undef HEDLEY_UNREACHABLE_RETURN -#endif -#if defined(HEDLEY_ASSUME) -# undef HEDLEY_ASSUME -#endif -#if \ - HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) -# define HEDLEY_ASSUME(expr) __assume(expr) -#elif HEDLEY_HAS_BUILTIN(__builtin_assume) -# define HEDLEY_ASSUME(expr) __builtin_assume(expr) -#elif \ - HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0) -# if defined(__cplusplus) -# define HEDLEY_ASSUME(expr) std::_nassert(expr) -# else -# define HEDLEY_ASSUME(expr) _nassert(expr) -# endif -#endif -#if \ - (HEDLEY_HAS_BUILTIN(__builtin_unreachable) && (!defined(HEDLEY_ARM_VERSION))) || \ - HEDLEY_GCC_VERSION_CHECK(4,5,0) || \ - HEDLEY_PGI_VERSION_CHECK(18,10,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_IBM_VERSION_CHECK(13,1,5) || \ - HEDLEY_CRAY_VERSION_CHECK(10,0,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_UNREACHABLE() __builtin_unreachable() -#elif defined(HEDLEY_ASSUME) -# define HEDLEY_UNREACHABLE() HEDLEY_ASSUME(0) -#endif -#if !defined(HEDLEY_ASSUME) -# if defined(HEDLEY_UNREACHABLE) -# define HEDLEY_ASSUME(expr) HEDLEY_STATIC_CAST(void, ((expr) ? 1 : (HEDLEY_UNREACHABLE(), 1))) -# else -# define HEDLEY_ASSUME(expr) HEDLEY_STATIC_CAST(void, expr) -# endif -#endif -#if defined(HEDLEY_UNREACHABLE) -# if \ - HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0) -# define HEDLEY_UNREACHABLE_RETURN(value) return (HEDLEY_STATIC_CAST(void, HEDLEY_ASSUME(0)), (value)) -# else -# define HEDLEY_UNREACHABLE_RETURN(value) HEDLEY_UNREACHABLE() -# endif -#else -# define HEDLEY_UNREACHABLE_RETURN(value) return (value) -#endif -#if !defined(HEDLEY_UNREACHABLE) -# define HEDLEY_UNREACHABLE() HEDLEY_ASSUME(0) -#endif - -HEDLEY_DIAGNOSTIC_PUSH -#if HEDLEY_HAS_WARNING("-Wpedantic") -# pragma clang diagnostic ignored "-Wpedantic" -#endif -#if HEDLEY_HAS_WARNING("-Wc++98-compat-pedantic") && defined(__cplusplus) -# pragma clang diagnostic ignored "-Wc++98-compat-pedantic" -#endif -#if HEDLEY_GCC_HAS_WARNING("-Wvariadic-macros",4,0,0) -# if defined(__clang__) -# pragma clang diagnostic ignored "-Wvariadic-macros" -# elif defined(HEDLEY_GCC_VERSION) -# pragma GCC diagnostic ignored "-Wvariadic-macros" -# endif -#endif -#if defined(HEDLEY_NON_NULL) -# undef HEDLEY_NON_NULL -#endif -#if \ - HEDLEY_HAS_ATTRIBUTE(nonnull) || \ - HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) -# define HEDLEY_NON_NULL(...) __attribute__((__nonnull__(__VA_ARGS__))) -#else -# define HEDLEY_NON_NULL(...) 
-#endif -HEDLEY_DIAGNOSTIC_POP - -#if defined(HEDLEY_PRINTF_FORMAT) -# undef HEDLEY_PRINTF_FORMAT -#endif -#if defined(__MINGW32__) && HEDLEY_GCC_HAS_ATTRIBUTE(format,4,4,0) && !defined(__USE_MINGW_ANSI_STDIO) -# define HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(ms_printf, string_idx, first_to_check))) -#elif defined(__MINGW32__) && HEDLEY_GCC_HAS_ATTRIBUTE(format,4,4,0) && defined(__USE_MINGW_ANSI_STDIO) -# define HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(gnu_printf, string_idx, first_to_check))) -#elif \ - HEDLEY_HAS_ATTRIBUTE(format) || \ - HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_ARM_VERSION_CHECK(5,6,0) || \ - HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ - (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ - (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ - (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(__printf__, string_idx, first_to_check))) -#elif HEDLEY_PELLES_VERSION_CHECK(6,0,0) -# define HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __declspec(vaformat(printf,string_idx,first_to_check)) -#else -# define HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) -#endif - -#if defined(HEDLEY_CONSTEXPR) -# undef HEDLEY_CONSTEXPR -#endif -#if defined(__cplusplus) -# if __cplusplus >= 201103L -# define HEDLEY_CONSTEXPR HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(constexpr) -# endif -#endif -#if !defined(HEDLEY_CONSTEXPR) -# define HEDLEY_CONSTEXPR -#endif - -#if defined(HEDLEY_PREDICT) -# undef HEDLEY_PREDICT -#endif -#if defined(HEDLEY_LIKELY) -# undef HEDLEY_LIKELY -#endif -#if defined(HEDLEY_UNLIKELY) -# undef HEDLEY_UNLIKELY -#endif -#if defined(HEDLEY_UNPREDICTABLE) -# undef HEDLEY_UNPREDICTABLE -#endif -#if HEDLEY_HAS_BUILTIN(__builtin_unpredictable) -# define HEDLEY_UNPREDICTABLE(expr) __builtin_unpredictable((expr)) -#endif -#if \ - (HEDLEY_HAS_BUILTIN(__builtin_expect_with_probability) && !defined(HEDLEY_PGI_VERSION) && !defined(HEDLEY_INTEL_VERSION)) || \ - HEDLEY_GCC_VERSION_CHECK(9,0,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_PREDICT(expr, value, probability) __builtin_expect_with_probability( (expr), (value), (probability)) -# define HEDLEY_PREDICT_TRUE(expr, probability) __builtin_expect_with_probability(!!(expr), 1 , (probability)) -# define HEDLEY_PREDICT_FALSE(expr, probability) __builtin_expect_with_probability(!!(expr), 0 , (probability)) -# define HEDLEY_LIKELY(expr) __builtin_expect (!!(expr), 1 ) -# define HEDLEY_UNLIKELY(expr) __builtin_expect (!!(expr), 0 ) -#elif \ - (HEDLEY_HAS_BUILTIN(__builtin_expect) && !defined(HEDLEY_INTEL_CL_VERSION)) || \ - HEDLEY_GCC_VERSION_CHECK(3,0,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - (HEDLEY_SUNPRO_VERSION_CHECK(5,15,0) && defined(__cplusplus)) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - 
HEDLEY_TI_ARMCL_VERSION_CHECK(4,7,0) || \ - HEDLEY_TI_CL430_VERSION_CHECK(3,1,0) || \ - HEDLEY_TI_CL2000_VERSION_CHECK(6,1,0) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ - HEDLEY_TINYC_VERSION_CHECK(0,9,27) || \ - HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_PREDICT(expr, expected, probability) \ - (((probability) >= 0.9) ? __builtin_expect((expr), (expected)) : (HEDLEY_STATIC_CAST(void, expected), (expr))) -# define HEDLEY_PREDICT_TRUE(expr, probability) \ - (__extension__ ({ \ - double hedley_probability_ = (probability); \ - ((hedley_probability_ >= 0.9) ? __builtin_expect(!!(expr), 1) : ((hedley_probability_ <= 0.1) ? __builtin_expect(!!(expr), 0) : !!(expr))); \ - })) -# define HEDLEY_PREDICT_FALSE(expr, probability) \ - (__extension__ ({ \ - double hedley_probability_ = (probability); \ - ((hedley_probability_ >= 0.9) ? __builtin_expect(!!(expr), 0) : ((hedley_probability_ <= 0.1) ? __builtin_expect(!!(expr), 1) : !!(expr))); \ - })) -# define HEDLEY_LIKELY(expr) __builtin_expect(!!(expr), 1) -# define HEDLEY_UNLIKELY(expr) __builtin_expect(!!(expr), 0) -#else -# define HEDLEY_PREDICT(expr, expected, probability) (HEDLEY_STATIC_CAST(void, expected), (expr)) -# define HEDLEY_PREDICT_TRUE(expr, probability) (!!(expr)) -# define HEDLEY_PREDICT_FALSE(expr, probability) (!!(expr)) -# define HEDLEY_LIKELY(expr) (!!(expr)) -# define HEDLEY_UNLIKELY(expr) (!!(expr)) -#endif -#if !defined(HEDLEY_UNPREDICTABLE) -# define HEDLEY_UNPREDICTABLE(expr) HEDLEY_PREDICT(expr, 1, 0.5) -#endif - -#if defined(HEDLEY_MALLOC) -# undef HEDLEY_MALLOC -#endif -#if \ - HEDLEY_HAS_ATTRIBUTE(malloc) || \ - HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_IBM_VERSION_CHECK(12,1,0) || \ - HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ - (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ - (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ - (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_MALLOC __attribute__((__malloc__)) -#elif HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) -# define HEDLEY_MALLOC _Pragma("returns_new_memory") -#elif \ - HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \ - HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) -# define HEDLEY_MALLOC __declspec(restrict) -#else -# define HEDLEY_MALLOC -#endif - -#if defined(HEDLEY_PURE) -# undef HEDLEY_PURE -#endif -#if \ - HEDLEY_HAS_ATTRIBUTE(pure) || \ - HEDLEY_GCC_VERSION_CHECK(2,96,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ - (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - 
HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ - (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ - (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ - HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_PURE __attribute__((__pure__)) -#elif HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) -# define HEDLEY_PURE _Pragma("does_not_write_global_data") -#elif defined(__cplusplus) && \ - ( \ - HEDLEY_TI_CL430_VERSION_CHECK(2,0,1) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) \ - ) -# define HEDLEY_PURE _Pragma("FUNC_IS_PURE;") -#else -# define HEDLEY_PURE -#endif - -#if defined(HEDLEY_CONST) -# undef HEDLEY_CONST -#endif -#if \ - HEDLEY_HAS_ATTRIBUTE(const) || \ - HEDLEY_GCC_VERSION_CHECK(2,5,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ - (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ - (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ - (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ - HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_CONST __attribute__((__const__)) -#elif \ - HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) -# define HEDLEY_CONST _Pragma("no_side_effect") -#else -# define HEDLEY_CONST HEDLEY_PURE -#endif - -#if defined(HEDLEY_RESTRICT) -# undef HEDLEY_RESTRICT -#endif -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && !defined(__cplusplus) -# define HEDLEY_RESTRICT restrict -#elif \ - HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ - HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ - HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ - HEDLEY_TI_CL2000_VERSION_CHECK(6,2,4) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(8,1,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - (HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus)) || \ - HEDLEY_IAR_VERSION_CHECK(8,0,0) || \ - defined(__clang__) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_RESTRICT __restrict -#elif HEDLEY_SUNPRO_VERSION_CHECK(5,3,0) && !defined(__cplusplus) -# define HEDLEY_RESTRICT _Restrict -#else -# define HEDLEY_RESTRICT -#endif - -#if defined(HEDLEY_INLINE) -# undef HEDLEY_INLINE -#endif -#if \ - (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \ - (defined(__cplusplus) && (__cplusplus >= 199711L)) -# define HEDLEY_INLINE inline -#elif \ - defined(HEDLEY_GCC_VERSION) || \ - HEDLEY_ARM_VERSION_CHECK(6,2,0) -# define HEDLEY_INLINE __inline__ -#elif \ - HEDLEY_MSVC_VERSION_CHECK(12,0,0) || \ - HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) 
|| \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_TI_ARMCL_VERSION_CHECK(5,1,0) || \ - HEDLEY_TI_CL430_VERSION_CHECK(3,1,0) || \ - HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_INLINE __inline -#else -# define HEDLEY_INLINE -#endif - -#if defined(HEDLEY_ALWAYS_INLINE) -# undef HEDLEY_ALWAYS_INLINE -#endif -#if \ - HEDLEY_HAS_ATTRIBUTE(always_inline) || \ - HEDLEY_GCC_VERSION_CHECK(4,0,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ - (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ - (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ - (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \ - HEDLEY_IAR_VERSION_CHECK(8,10,0) -# define HEDLEY_ALWAYS_INLINE __attribute__((__always_inline__)) HEDLEY_INLINE -#elif \ - HEDLEY_MSVC_VERSION_CHECK(12,0,0) || \ - HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) -# define HEDLEY_ALWAYS_INLINE __forceinline -#elif defined(__cplusplus) && \ - ( \ - HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ - HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ - HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) \ - ) -# define HEDLEY_ALWAYS_INLINE _Pragma("FUNC_ALWAYS_INLINE;") -#elif HEDLEY_IAR_VERSION_CHECK(8,0,0) -# define HEDLEY_ALWAYS_INLINE _Pragma("inline=forced") -#else -# define HEDLEY_ALWAYS_INLINE HEDLEY_INLINE -#endif - -#if defined(HEDLEY_NEVER_INLINE) -# undef HEDLEY_NEVER_INLINE -#endif -#if \ - HEDLEY_HAS_ATTRIBUTE(noinline) || \ - HEDLEY_GCC_VERSION_CHECK(4,0,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ - (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ - (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ - (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \ - HEDLEY_IAR_VERSION_CHECK(8,10,0) -# define HEDLEY_NEVER_INLINE __attribute__((__noinline__)) -#elif \ - HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ - HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) -# define HEDLEY_NEVER_INLINE __declspec(noinline) -#elif HEDLEY_PGI_VERSION_CHECK(10,2,0) -# define HEDLEY_NEVER_INLINE 
_Pragma("noinline") -#elif HEDLEY_TI_CL6X_VERSION_CHECK(6,0,0) && defined(__cplusplus) -# define HEDLEY_NEVER_INLINE _Pragma("FUNC_CANNOT_INLINE;") -#elif HEDLEY_IAR_VERSION_CHECK(8,0,0) -# define HEDLEY_NEVER_INLINE _Pragma("inline=never") -#elif HEDLEY_COMPCERT_VERSION_CHECK(3,2,0) -# define HEDLEY_NEVER_INLINE __attribute((noinline)) -#elif HEDLEY_PELLES_VERSION_CHECK(9,0,0) -# define HEDLEY_NEVER_INLINE __declspec(noinline) -#else -# define HEDLEY_NEVER_INLINE -#endif - -#if defined(HEDLEY_PRIVATE) -# undef HEDLEY_PRIVATE -#endif -#if defined(HEDLEY_PUBLIC) -# undef HEDLEY_PUBLIC -#endif -#if defined(HEDLEY_IMPORT) -# undef HEDLEY_IMPORT -#endif -#if defined(_WIN32) || defined(__CYGWIN__) -# define HEDLEY_PRIVATE -# define HEDLEY_PUBLIC __declspec(dllexport) -# define HEDLEY_IMPORT __declspec(dllimport) -#else -# if \ - HEDLEY_HAS_ATTRIBUTE(visibility) || \ - HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ - HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_IBM_VERSION_CHECK(13,1,0) || \ - ( \ - defined(__TI_EABI__) && \ - ( \ - (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) \ - ) \ - ) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_PRIVATE __attribute__((__visibility__("hidden"))) -# define HEDLEY_PUBLIC __attribute__((__visibility__("default"))) -# else -# define HEDLEY_PRIVATE -# define HEDLEY_PUBLIC -# endif -# define HEDLEY_IMPORT extern -#endif - -#if defined(HEDLEY_NO_THROW) -# undef HEDLEY_NO_THROW -#endif -#if \ - HEDLEY_HAS_ATTRIBUTE(nothrow) || \ - HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_NO_THROW __attribute__((__nothrow__)) -#elif \ - HEDLEY_MSVC_VERSION_CHECK(13,1,0) || \ - HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) -# define HEDLEY_NO_THROW __declspec(nothrow) -#else -# define HEDLEY_NO_THROW -#endif - -#if defined(HEDLEY_FALL_THROUGH) -# undef HEDLEY_FALL_THROUGH -#endif -#if defined(HEDLEY_INTEL_VERSION) -# define HEDLEY_FALL_THROUGH -#elif \ - HEDLEY_HAS_ATTRIBUTE(fallthrough) || \ - HEDLEY_GCC_VERSION_CHECK(7,0,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_FALL_THROUGH __attribute__((__fallthrough__)) -#elif HEDLEY_HAS_CPP_ATTRIBUTE_NS(clang,fallthrough) -# define HEDLEY_FALL_THROUGH HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[clang::fallthrough]]) -#elif HEDLEY_HAS_CPP_ATTRIBUTE(fallthrough) -# define HEDLEY_FALL_THROUGH HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[fallthrough]]) -#elif defined(__fallthrough) /* SAL */ -# define HEDLEY_FALL_THROUGH __fallthrough -#else -# define HEDLEY_FALL_THROUGH -#endif - -#if defined(HEDLEY_RETURNS_NON_NULL) -# undef HEDLEY_RETURNS_NON_NULL -#endif -#if \ - HEDLEY_HAS_ATTRIBUTE(returns_nonnull) || \ - HEDLEY_GCC_VERSION_CHECK(4,9,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_RETURNS_NON_NULL __attribute__((__returns_nonnull__)) -#elif defined(_Ret_notnull_) /* SAL */ -# define HEDLEY_RETURNS_NON_NULL _Ret_notnull_ -#else -# define HEDLEY_RETURNS_NON_NULL -#endif - -#if defined(HEDLEY_ARRAY_PARAM) -# undef HEDLEY_ARRAY_PARAM -#endif -#if \ - defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && \ - !defined(__STDC_NO_VLA__) && \ - !defined(__cplusplus) && \ - !defined(HEDLEY_PGI_VERSION) && \ - !defined(HEDLEY_TINYC_VERSION) -# define HEDLEY_ARRAY_PARAM(name) (name) -#else -# define 
HEDLEY_ARRAY_PARAM(name) -#endif - -#if defined(HEDLEY_IS_CONSTANT) -# undef HEDLEY_IS_CONSTANT -#endif -#if defined(HEDLEY_REQUIRE_CONSTEXPR) -# undef HEDLEY_REQUIRE_CONSTEXPR -#endif -/* HEDLEY_IS_CONSTEXPR_ is for - HEDLEY INTERNAL USE ONLY. API subject to change without notice. */ -#if defined(HEDLEY_IS_CONSTEXPR_) -# undef HEDLEY_IS_CONSTEXPR_ -#endif -#if \ - HEDLEY_HAS_BUILTIN(__builtin_constant_p) || \ - HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_TINYC_VERSION_CHECK(0,9,19) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_IBM_VERSION_CHECK(13,1,0) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \ - (HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) && !defined(__cplusplus)) || \ - HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_IS_CONSTANT(expr) __builtin_constant_p(expr) -#endif -#if !defined(__cplusplus) -# if \ - HEDLEY_HAS_BUILTIN(__builtin_types_compatible_p) || \ - HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_IBM_VERSION_CHECK(13,1,0) || \ - HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \ - HEDLEY_ARM_VERSION_CHECK(5,4,0) || \ - HEDLEY_TINYC_VERSION_CHECK(0,9,24) -# if defined(__INTPTR_TYPE__) -# define HEDLEY_IS_CONSTEXPR_(expr) __builtin_types_compatible_p(__typeof__((1 ? (void*) ((__INTPTR_TYPE__) ((expr) * 0)) : (int*) 0)), int*) -# else -# include <stdint.h> -# define HEDLEY_IS_CONSTEXPR_(expr) __builtin_types_compatible_p(__typeof__((1 ? (void*) ((intptr_t) ((expr) * 0)) : (int*) 0)), int*) -# endif -# elif \ - ( \ - defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \ - !defined(HEDLEY_SUNPRO_VERSION) && \ - !defined(HEDLEY_PGI_VERSION) && \ - !defined(HEDLEY_IAR_VERSION)) || \ - (HEDLEY_HAS_EXTENSION(c_generic_selections) && !defined(HEDLEY_IAR_VERSION)) || \ - HEDLEY_GCC_VERSION_CHECK(4,9,0) || \ - HEDLEY_INTEL_VERSION_CHECK(17,0,0) || \ - HEDLEY_IBM_VERSION_CHECK(12,1,0) || \ - HEDLEY_ARM_VERSION_CHECK(5,3,0) -# if defined(__INTPTR_TYPE__) -# define HEDLEY_IS_CONSTEXPR_(expr) _Generic((1 ? (void*) ((__INTPTR_TYPE__) ((expr) * 0)) : (int*) 0), int*: 1, void*: 0) -# else -# include <stdint.h> -# define HEDLEY_IS_CONSTEXPR_(expr) _Generic((1 ? (void*) ((intptr_t) * 0) : (int*) 0), int*: 1, void*: 0) -# endif -# elif \ - defined(HEDLEY_GCC_VERSION) || \ - defined(HEDLEY_INTEL_VERSION) || \ - defined(HEDLEY_TINYC_VERSION) || \ - defined(HEDLEY_TI_ARMCL_VERSION) || \ - HEDLEY_TI_CL430_VERSION_CHECK(18,12,0) || \ - defined(HEDLEY_TI_CL2000_VERSION) || \ - defined(HEDLEY_TI_CL6X_VERSION) || \ - defined(HEDLEY_TI_CL7X_VERSION) || \ - defined(HEDLEY_TI_CLPRU_VERSION) || \ - defined(__clang__) -# define HEDLEY_IS_CONSTEXPR_(expr) ( \ - sizeof(void) != \ - sizeof(*( \ - 1 ? \ - ((void*) ((expr) * 0L) ) : \ - ((struct { char v[sizeof(void) * 2]; } *) 1) \ - ) \ - ) \ - ) -# endif -#endif -#if defined(HEDLEY_IS_CONSTEXPR_) -# if !defined(HEDLEY_IS_CONSTANT) -# define HEDLEY_IS_CONSTANT(expr) HEDLEY_IS_CONSTEXPR_(expr) -# endif -# define HEDLEY_REQUIRE_CONSTEXPR(expr) (HEDLEY_IS_CONSTEXPR_(expr) ?
(expr) : (-1)) -#else -# if !defined(HEDLEY_IS_CONSTANT) -# define HEDLEY_IS_CONSTANT(expr) (0) -# endif -# define HEDLEY_REQUIRE_CONSTEXPR(expr) (expr) -#endif - -#if defined(HEDLEY_BEGIN_C_DECLS) -# undef HEDLEY_BEGIN_C_DECLS -#endif -#if defined(HEDLEY_END_C_DECLS) -# undef HEDLEY_END_C_DECLS -#endif -#if defined(HEDLEY_C_DECL) -# undef HEDLEY_C_DECL -#endif -#if defined(__cplusplus) -# define HEDLEY_BEGIN_C_DECLS extern "C" { -# define HEDLEY_END_C_DECLS } -# define HEDLEY_C_DECL extern "C" -#else -# define HEDLEY_BEGIN_C_DECLS -# define HEDLEY_END_C_DECLS -# define HEDLEY_C_DECL -#endif - -#if defined(HEDLEY_STATIC_ASSERT) -# undef HEDLEY_STATIC_ASSERT -#endif -#if \ - !defined(__cplusplus) && ( \ - (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || \ - (HEDLEY_HAS_FEATURE(c_static_assert) && !defined(HEDLEY_INTEL_CL_VERSION)) || \ - HEDLEY_GCC_VERSION_CHECK(6,0,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - defined(_Static_assert) \ - ) -# define HEDLEY_STATIC_ASSERT(expr, message) _Static_assert(expr, message) -#elif \ - (defined(__cplusplus) && (__cplusplus >= 201103L)) || \ - HEDLEY_MSVC_VERSION_CHECK(16,0,0) || \ - HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) -# define HEDLEY_STATIC_ASSERT(expr, message) HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(static_assert(expr, message)) -#else -# define HEDLEY_STATIC_ASSERT(expr, message) -#endif - -#if defined(HEDLEY_NULL) -# undef HEDLEY_NULL -#endif -#if defined(__cplusplus) -# if __cplusplus >= 201103L -# define HEDLEY_NULL HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(nullptr) -# elif defined(NULL) -# define HEDLEY_NULL NULL -# else -# define HEDLEY_NULL HEDLEY_STATIC_CAST(void*, 0) -# endif -#elif defined(NULL) -# define HEDLEY_NULL NULL -#else -# define HEDLEY_NULL ((void*) 0) -#endif - -#if defined(HEDLEY_MESSAGE) -# undef HEDLEY_MESSAGE -#endif -#if HEDLEY_HAS_WARNING("-Wunknown-pragmas") -# define HEDLEY_MESSAGE(msg) \ - HEDLEY_DIAGNOSTIC_PUSH \ - HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \ - HEDLEY_PRAGMA(message msg) \ - HEDLEY_DIAGNOSTIC_POP -#elif \ - HEDLEY_GCC_VERSION_CHECK(4,4,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) -# define HEDLEY_MESSAGE(msg) HEDLEY_PRAGMA(message msg) -#elif HEDLEY_CRAY_VERSION_CHECK(5,0,0) -# define HEDLEY_MESSAGE(msg) HEDLEY_PRAGMA(_CRI message msg) -#elif HEDLEY_IAR_VERSION_CHECK(8,0,0) -# define HEDLEY_MESSAGE(msg) HEDLEY_PRAGMA(message(msg)) -#elif HEDLEY_PELLES_VERSION_CHECK(2,0,0) -# define HEDLEY_MESSAGE(msg) HEDLEY_PRAGMA(message(msg)) -#else -# define HEDLEY_MESSAGE(msg) -#endif - -#if defined(HEDLEY_WARNING) -# undef HEDLEY_WARNING -#endif -#if HEDLEY_HAS_WARNING("-Wunknown-pragmas") -# define HEDLEY_WARNING(msg) \ - HEDLEY_DIAGNOSTIC_PUSH \ - HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \ - HEDLEY_PRAGMA(clang warning msg) \ - HEDLEY_DIAGNOSTIC_POP -#elif \ - HEDLEY_GCC_VERSION_CHECK(4,8,0) || \ - HEDLEY_PGI_VERSION_CHECK(18,4,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) -# define HEDLEY_WARNING(msg) HEDLEY_PRAGMA(GCC warning msg) -#elif \ - HEDLEY_MSVC_VERSION_CHECK(15,0,0) || \ - HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) -# define HEDLEY_WARNING(msg) HEDLEY_PRAGMA(message(msg)) -#else -# define HEDLEY_WARNING(msg) HEDLEY_MESSAGE(msg) -#endif - -#if defined(HEDLEY_REQUIRE) -# undef HEDLEY_REQUIRE -#endif -#if defined(HEDLEY_REQUIRE_MSG) -# undef HEDLEY_REQUIRE_MSG -#endif -#if HEDLEY_HAS_ATTRIBUTE(diagnose_if) -# if HEDLEY_HAS_WARNING("-Wgcc-compat") -# define HEDLEY_REQUIRE(expr) \ - HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \ - 
__attribute__((diagnose_if(!(expr), #expr, "error"))) \ - HEDLEY_DIAGNOSTIC_POP -# define HEDLEY_REQUIRE_MSG(expr,msg) \ - HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \ - __attribute__((diagnose_if(!(expr), msg, "error"))) \ - HEDLEY_DIAGNOSTIC_POP -# else -# define HEDLEY_REQUIRE(expr) __attribute__((diagnose_if(!(expr), #expr, "error"))) -# define HEDLEY_REQUIRE_MSG(expr,msg) __attribute__((diagnose_if(!(expr), msg, "error"))) -# endif -#else -# define HEDLEY_REQUIRE(expr) -# define HEDLEY_REQUIRE_MSG(expr,msg) -#endif - -#if defined(HEDLEY_FLAGS) -# undef HEDLEY_FLAGS -#endif -#if HEDLEY_HAS_ATTRIBUTE(flag_enum) && (!defined(__cplusplus) || HEDLEY_HAS_WARNING("-Wbitfield-enum-conversion")) -# define HEDLEY_FLAGS __attribute__((__flag_enum__)) -#else -# define HEDLEY_FLAGS -#endif - -#if defined(HEDLEY_FLAGS_CAST) -# undef HEDLEY_FLAGS_CAST -#endif -#if HEDLEY_INTEL_VERSION_CHECK(19,0,0) -# define HEDLEY_FLAGS_CAST(T, expr) (__extension__ ({ \ - HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("warning(disable:188)") \ - ((T) (expr)); \ - HEDLEY_DIAGNOSTIC_POP \ - })) -#else -# define HEDLEY_FLAGS_CAST(T, expr) HEDLEY_STATIC_CAST(T, expr) -#endif - -#if defined(HEDLEY_EMPTY_BASES) -# undef HEDLEY_EMPTY_BASES -#endif -#if \ - (HEDLEY_MSVC_VERSION_CHECK(19,0,23918) && !HEDLEY_MSVC_VERSION_CHECK(20,0,0)) || \ - HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) -# define HEDLEY_EMPTY_BASES __declspec(empty_bases) -#else -# define HEDLEY_EMPTY_BASES -#endif - -/* Remaining macros are deprecated. */ - -#if defined(HEDLEY_GCC_NOT_CLANG_VERSION_CHECK) -# undef HEDLEY_GCC_NOT_CLANG_VERSION_CHECK -#endif -#if defined(__clang__) -# define HEDLEY_GCC_NOT_CLANG_VERSION_CHECK(major,minor,patch) (0) -#else -# define HEDLEY_GCC_NOT_CLANG_VERSION_CHECK(major,minor,patch) HEDLEY_GCC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(HEDLEY_CLANG_HAS_ATTRIBUTE) -# undef HEDLEY_CLANG_HAS_ATTRIBUTE -#endif -#define HEDLEY_CLANG_HAS_ATTRIBUTE(attribute) HEDLEY_HAS_ATTRIBUTE(attribute) - -#if defined(HEDLEY_CLANG_HAS_CPP_ATTRIBUTE) -# undef HEDLEY_CLANG_HAS_CPP_ATTRIBUTE -#endif -#define HEDLEY_CLANG_HAS_CPP_ATTRIBUTE(attribute) HEDLEY_HAS_CPP_ATTRIBUTE(attribute) - -#if defined(HEDLEY_CLANG_HAS_BUILTIN) -# undef HEDLEY_CLANG_HAS_BUILTIN -#endif -#define HEDLEY_CLANG_HAS_BUILTIN(builtin) HEDLEY_HAS_BUILTIN(builtin) - -#if defined(HEDLEY_CLANG_HAS_FEATURE) -# undef HEDLEY_CLANG_HAS_FEATURE -#endif -#define HEDLEY_CLANG_HAS_FEATURE(feature) HEDLEY_HAS_FEATURE(feature) - -#if defined(HEDLEY_CLANG_HAS_EXTENSION) -# undef HEDLEY_CLANG_HAS_EXTENSION -#endif -#define HEDLEY_CLANG_HAS_EXTENSION(extension) HEDLEY_HAS_EXTENSION(extension) - -#if defined(HEDLEY_CLANG_HAS_DECLSPEC_DECLSPEC_ATTRIBUTE) -# undef HEDLEY_CLANG_HAS_DECLSPEC_DECLSPEC_ATTRIBUTE -#endif -#define HEDLEY_CLANG_HAS_DECLSPEC_ATTRIBUTE(attribute) HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) - -#if defined(HEDLEY_CLANG_HAS_WARNING) -# undef HEDLEY_CLANG_HAS_WARNING -#endif -#define HEDLEY_CLANG_HAS_WARNING(warning) HEDLEY_HAS_WARNING(warning) - -#endif /* !defined(HEDLEY_VERSION) || (HEDLEY_VERSION < X) */ diff --git a/extern/simde/simde-aes.h b/extern/simde/simde-aes.h deleted file mode 100644 index 3ba650eaf..000000000 --- a/extern/simde/simde-aes.h +++ /dev/null @@ -1,265 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without 
limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) - */ - -#if !defined(SIMDE_AES_H) -#define SIMDE_AES_H - -#include "simde-features.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS - -#if !(defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO)) - -/* - * Number of columns (32-bit words) comprising the State. For this - * standard, Nb = 4. - */ -#define simde_x_aes_Nb 4 - -static uint8_t simde_x_aes_gmult_lookup_table[8][256] = { -{ // gmult(0x02, b); - 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e, - 0x20, 0x22, 0x24, 0x26, 0x28, 0x2a, 0x2c, 0x2e, 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e, - 0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e, 0x50, 0x52, 0x54, 0x56, 0x58, 0x5a, 0x5c, 0x5e, - 0x60, 0x62, 0x64, 0x66, 0x68, 0x6a, 0x6c, 0x6e, 0x70, 0x72, 0x74, 0x76, 0x78, 0x7a, 0x7c, 0x7e, - 0x80, 0x82, 0x84, 0x86, 0x88, 0x8a, 0x8c, 0x8e, 0x90, 0x92, 0x94, 0x96, 0x98, 0x9a, 0x9c, 0x9e, - 0xa0, 0xa2, 0xa4, 0xa6, 0xa8, 0xaa, 0xac, 0xae, 0xb0, 0xb2, 0xb4, 0xb6, 0xb8, 0xba, 0xbc, 0xbe, - 0xc0, 0xc2, 0xc4, 0xc6, 0xc8, 0xca, 0xcc, 0xce, 0xd0, 0xd2, 0xd4, 0xd6, 0xd8, 0xda, 0xdc, 0xde, - 0xe0, 0xe2, 0xe4, 0xe6, 0xe8, 0xea, 0xec, 0xee, 0xf0, 0xf2, 0xf4, 0xf6, 0xf8, 0xfa, 0xfc, 0xfe, - 0x1b, 0x19, 0x1f, 0x1d, 0x13, 0x11, 0x17, 0x15, 0x0b, 0x09, 0x0f, 0x0d, 0x03, 0x01, 0x07, 0x05, - 0x3b, 0x39, 0x3f, 0x3d, 0x33, 0x31, 0x37, 0x35, 0x2b, 0x29, 0x2f, 0x2d, 0x23, 0x21, 0x27, 0x25, - 0x5b, 0x59, 0x5f, 0x5d, 0x53, 0x51, 0x57, 0x55, 0x4b, 0x49, 0x4f, 0x4d, 0x43, 0x41, 0x47, 0x45, - 0x7b, 0x79, 0x7f, 0x7d, 0x73, 0x71, 0x77, 0x75, 0x6b, 0x69, 0x6f, 0x6d, 0x63, 0x61, 0x67, 0x65, - 0x9b, 0x99, 0x9f, 0x9d, 0x93, 0x91, 0x97, 0x95, 0x8b, 0x89, 0x8f, 0x8d, 0x83, 0x81, 0x87, 0x85, - 0xbb, 0xb9, 0xbf, 0xbd, 0xb3, 0xb1, 0xb7, 0xb5, 0xab, 0xa9, 0xaf, 0xad, 0xa3, 0xa1, 0xa7, 0xa5, - 0xdb, 0xd9, 0xdf, 0xdd, 0xd3, 0xd1, 0xd7, 0xd5, 0xcb, 0xc9, 0xcf, 0xcd, 0xc3, 0xc1, 0xc7, 0xc5, - 0xfb, 0xf9, 0xff, 0xfd, 0xf3, 0xf1, 0xf7, 0xf5, 0xeb, 0xe9, 0xef, 0xed, 0xe3, 0xe1, 0xe7, 0xe5 -}, -{ // gmult(0x01, b); - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, - 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, - 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, - 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, - 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, - 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 
0x5d, 0x5e, 0x5f, - 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, - 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, - 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, - 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, - 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, - 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, - 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, - 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, - 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, - 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, -}, -{ // gmult(0x01, b); - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, - 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, - 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, - 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, - 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, - 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, - 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, - 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, - 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, - 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, - 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, - 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, - 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, - 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, - 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, - 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, -}, -{ // gmult(0x03, b); - 0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09, 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11, - 0x30, 0x33, 0x36, 0x35, 0x3c, 0x3f, 0x3a, 0x39, 0x28, 0x2b, 0x2e, 0x2d, 0x24, 0x27, 0x22, 0x21, - 0x60, 0x63, 0x66, 0x65, 0x6c, 0x6f, 0x6a, 0x69, 0x78, 0x7b, 0x7e, 0x7d, 0x74, 0x77, 0x72, 0x71, - 0x50, 0x53, 0x56, 0x55, 0x5c, 0x5f, 0x5a, 0x59, 0x48, 0x4b, 0x4e, 0x4d, 0x44, 0x47, 0x42, 0x41, - 0xc0, 0xc3, 0xc6, 0xc5, 0xcc, 0xcf, 0xca, 0xc9, 0xd8, 0xdb, 0xde, 0xdd, 0xd4, 0xd7, 0xd2, 0xd1, - 0xf0, 0xf3, 0xf6, 0xf5, 0xfc, 0xff, 0xfa, 0xf9, 0xe8, 0xeb, 0xee, 0xed, 0xe4, 0xe7, 0xe2, 0xe1, - 0xa0, 0xa3, 0xa6, 0xa5, 0xac, 0xaf, 0xaa, 0xa9, 0xb8, 0xbb, 0xbe, 0xbd, 0xb4, 0xb7, 0xb2, 0xb1, - 0x90, 0x93, 0x96, 0x95, 0x9c, 0x9f, 0x9a, 0x99, 0x88, 0x8b, 0x8e, 0x8d, 0x84, 0x87, 0x82, 0x81, - 0x9b, 0x98, 0x9d, 0x9e, 0x97, 0x94, 0x91, 0x92, 0x83, 0x80, 0x85, 0x86, 0x8f, 0x8c, 0x89, 0x8a, - 0xab, 0xa8, 0xad, 0xae, 0xa7, 0xa4, 0xa1, 0xa2, 
0xb3, 0xb0, 0xb5, 0xb6, 0xbf, 0xbc, 0xb9, 0xba, - 0xfb, 0xf8, 0xfd, 0xfe, 0xf7, 0xf4, 0xf1, 0xf2, 0xe3, 0xe0, 0xe5, 0xe6, 0xef, 0xec, 0xe9, 0xea, - 0xcb, 0xc8, 0xcd, 0xce, 0xc7, 0xc4, 0xc1, 0xc2, 0xd3, 0xd0, 0xd5, 0xd6, 0xdf, 0xdc, 0xd9, 0xda, - 0x5b, 0x58, 0x5d, 0x5e, 0x57, 0x54, 0x51, 0x52, 0x43, 0x40, 0x45, 0x46, 0x4f, 0x4c, 0x49, 0x4a, - 0x6b, 0x68, 0x6d, 0x6e, 0x67, 0x64, 0x61, 0x62, 0x73, 0x70, 0x75, 0x76, 0x7f, 0x7c, 0x79, 0x7a, - 0x3b, 0x38, 0x3d, 0x3e, 0x37, 0x34, 0x31, 0x32, 0x23, 0x20, 0x25, 0x26, 0x2f, 0x2c, 0x29, 0x2a, - 0x0b, 0x08, 0x0d, 0x0e, 0x07, 0x04, 0x01, 0x02, 0x13, 0x10, 0x15, 0x16, 0x1f, 0x1c, 0x19, 0x1a, -}, -{ // gmult(0x0e, b); - 0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a, - 0xe0, 0xee, 0xfc, 0xf2, 0xd8, 0xd6, 0xc4, 0xca, 0x90, 0x9e, 0x8c, 0x82, 0xa8, 0xa6, 0xb4, 0xba, - 0xdb, 0xd5, 0xc7, 0xc9, 0xe3, 0xed, 0xff, 0xf1, 0xab, 0xa5, 0xb7, 0xb9, 0x93, 0x9d, 0x8f, 0x81, - 0x3b, 0x35, 0x27, 0x29, 0x03, 0x0d, 0x1f, 0x11, 0x4b, 0x45, 0x57, 0x59, 0x73, 0x7d, 0x6f, 0x61, - 0xad, 0xa3, 0xb1, 0xbf, 0x95, 0x9b, 0x89, 0x87, 0xdd, 0xd3, 0xc1, 0xcf, 0xe5, 0xeb, 0xf9, 0xf7, - 0x4d, 0x43, 0x51, 0x5f, 0x75, 0x7b, 0x69, 0x67, 0x3d, 0x33, 0x21, 0x2f, 0x05, 0x0b, 0x19, 0x17, - 0x76, 0x78, 0x6a, 0x64, 0x4e, 0x40, 0x52, 0x5c, 0x06, 0x08, 0x1a, 0x14, 0x3e, 0x30, 0x22, 0x2c, - 0x96, 0x98, 0x8a, 0x84, 0xae, 0xa0, 0xb2, 0xbc, 0xe6, 0xe8, 0xfa, 0xf4, 0xde, 0xd0, 0xc2, 0xcc, - 0x41, 0x4f, 0x5d, 0x53, 0x79, 0x77, 0x65, 0x6b, 0x31, 0x3f, 0x2d, 0x23, 0x09, 0x07, 0x15, 0x1b, - 0xa1, 0xaf, 0xbd, 0xb3, 0x99, 0x97, 0x85, 0x8b, 0xd1, 0xdf, 0xcd, 0xc3, 0xe9, 0xe7, 0xf5, 0xfb, - 0x9a, 0x94, 0x86, 0x88, 0xa2, 0xac, 0xbe, 0xb0, 0xea, 0xe4, 0xf6, 0xf8, 0xd2, 0xdc, 0xce, 0xc0, - 0x7a, 0x74, 0x66, 0x68, 0x42, 0x4c, 0x5e, 0x50, 0x0a, 0x04, 0x16, 0x18, 0x32, 0x3c, 0x2e, 0x20, - 0xec, 0xe2, 0xf0, 0xfe, 0xd4, 0xda, 0xc8, 0xc6, 0x9c, 0x92, 0x80, 0x8e, 0xa4, 0xaa, 0xb8, 0xb6, - 0x0c, 0x02, 0x10, 0x1e, 0x34, 0x3a, 0x28, 0x26, 0x7c, 0x72, 0x60, 0x6e, 0x44, 0x4a, 0x58, 0x56, - 0x37, 0x39, 0x2b, 0x25, 0x0f, 0x01, 0x13, 0x1d, 0x47, 0x49, 0x5b, 0x55, 0x7f, 0x71, 0x63, 0x6d, - 0xd7, 0xd9, 0xcb, 0xc5, 0xef, 0xe1, 0xf3, 0xfd, 0xa7, 0xa9, 0xbb, 0xb5, 0x9f, 0x91, 0x83, 0x8d, -}, -{ // gmult(0x09, b); - 0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77, - 0x90, 0x99, 0x82, 0x8b, 0xb4, 0xbd, 0xa6, 0xaf, 0xd8, 0xd1, 0xca, 0xc3, 0xfc, 0xf5, 0xee, 0xe7, - 0x3b, 0x32, 0x29, 0x20, 0x1f, 0x16, 0x0d, 0x04, 0x73, 0x7a, 0x61, 0x68, 0x57, 0x5e, 0x45, 0x4c, - 0xab, 0xa2, 0xb9, 0xb0, 0x8f, 0x86, 0x9d, 0x94, 0xe3, 0xea, 0xf1, 0xf8, 0xc7, 0xce, 0xd5, 0xdc, - 0x76, 0x7f, 0x64, 0x6d, 0x52, 0x5b, 0x40, 0x49, 0x3e, 0x37, 0x2c, 0x25, 0x1a, 0x13, 0x08, 0x01, - 0xe6, 0xef, 0xf4, 0xfd, 0xc2, 0xcb, 0xd0, 0xd9, 0xae, 0xa7, 0xbc, 0xb5, 0x8a, 0x83, 0x98, 0x91, - 0x4d, 0x44, 0x5f, 0x56, 0x69, 0x60, 0x7b, 0x72, 0x05, 0x0c, 0x17, 0x1e, 0x21, 0x28, 0x33, 0x3a, - 0xdd, 0xd4, 0xcf, 0xc6, 0xf9, 0xf0, 0xeb, 0xe2, 0x95, 0x9c, 0x87, 0x8e, 0xb1, 0xb8, 0xa3, 0xaa, - 0xec, 0xe5, 0xfe, 0xf7, 0xc8, 0xc1, 0xda, 0xd3, 0xa4, 0xad, 0xb6, 0xbf, 0x80, 0x89, 0x92, 0x9b, - 0x7c, 0x75, 0x6e, 0x67, 0x58, 0x51, 0x4a, 0x43, 0x34, 0x3d, 0x26, 0x2f, 0x10, 0x19, 0x02, 0x0b, - 0xd7, 0xde, 0xc5, 0xcc, 0xf3, 0xfa, 0xe1, 0xe8, 0x9f, 0x96, 0x8d, 0x84, 0xbb, 0xb2, 0xa9, 0xa0, - 0x47, 0x4e, 0x55, 0x5c, 0x63, 0x6a, 0x71, 0x78, 0x0f, 0x06, 0x1d, 0x14, 0x2b, 0x22, 0x39, 0x30, - 0x9a, 0x93, 0x88, 0x81, 0xbe, 0xb7, 0xac, 0xa5, 0xd2, 0xdb, 0xc0, 0xc9, 0xf6, 0xff, 0xe4, 0xed, - 0x0a, 0x03, 0x18, 
0x11, 0x2e, 0x27, 0x3c, 0x35, 0x42, 0x4b, 0x50, 0x59, 0x66, 0x6f, 0x74, 0x7d, - 0xa1, 0xa8, 0xb3, 0xba, 0x85, 0x8c, 0x97, 0x9e, 0xe9, 0xe0, 0xfb, 0xf2, 0xcd, 0xc4, 0xdf, 0xd6, - 0x31, 0x38, 0x23, 0x2a, 0x15, 0x1c, 0x07, 0x0e, 0x79, 0x70, 0x6b, 0x62, 0x5d, 0x54, 0x4f, 0x46, - -}, -{ // gmult(0x0d, b); - 0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b, - 0xd0, 0xdd, 0xca, 0xc7, 0xe4, 0xe9, 0xfe, 0xf3, 0xb8, 0xb5, 0xa2, 0xaf, 0x8c, 0x81, 0x96, 0x9b, - 0xbb, 0xb6, 0xa1, 0xac, 0x8f, 0x82, 0x95, 0x98, 0xd3, 0xde, 0xc9, 0xc4, 0xe7, 0xea, 0xfd, 0xf0, - 0x6b, 0x66, 0x71, 0x7c, 0x5f, 0x52, 0x45, 0x48, 0x03, 0x0e, 0x19, 0x14, 0x37, 0x3a, 0x2d, 0x20, - 0x6d, 0x60, 0x77, 0x7a, 0x59, 0x54, 0x43, 0x4e, 0x05, 0x08, 0x1f, 0x12, 0x31, 0x3c, 0x2b, 0x26, - 0xbd, 0xb0, 0xa7, 0xaa, 0x89, 0x84, 0x93, 0x9e, 0xd5, 0xd8, 0xcf, 0xc2, 0xe1, 0xec, 0xfb, 0xf6, - 0xd6, 0xdb, 0xcc, 0xc1, 0xe2, 0xef, 0xf8, 0xf5, 0xbe, 0xb3, 0xa4, 0xa9, 0x8a, 0x87, 0x90, 0x9d, - 0x06, 0x0b, 0x1c, 0x11, 0x32, 0x3f, 0x28, 0x25, 0x6e, 0x63, 0x74, 0x79, 0x5a, 0x57, 0x40, 0x4d, - 0xda, 0xd7, 0xc0, 0xcd, 0xee, 0xe3, 0xf4, 0xf9, 0xb2, 0xbf, 0xa8, 0xa5, 0x86, 0x8b, 0x9c, 0x91, - 0x0a, 0x07, 0x10, 0x1d, 0x3e, 0x33, 0x24, 0x29, 0x62, 0x6f, 0x78, 0x75, 0x56, 0x5b, 0x4c, 0x41, - 0x61, 0x6c, 0x7b, 0x76, 0x55, 0x58, 0x4f, 0x42, 0x09, 0x04, 0x13, 0x1e, 0x3d, 0x30, 0x27, 0x2a, - 0xb1, 0xbc, 0xab, 0xa6, 0x85, 0x88, 0x9f, 0x92, 0xd9, 0xd4, 0xc3, 0xce, 0xed, 0xe0, 0xf7, 0xfa, - 0xb7, 0xba, 0xad, 0xa0, 0x83, 0x8e, 0x99, 0x94, 0xdf, 0xd2, 0xc5, 0xc8, 0xeb, 0xe6, 0xf1, 0xfc, - 0x67, 0x6a, 0x7d, 0x70, 0x53, 0x5e, 0x49, 0x44, 0x0f, 0x02, 0x15, 0x18, 0x3b, 0x36, 0x21, 0x2c, - 0x0c, 0x01, 0x16, 0x1b, 0x38, 0x35, 0x22, 0x2f, 0x64, 0x69, 0x7e, 0x73, 0x50, 0x5d, 0x4a, 0x47, - 0xdc, 0xd1, 0xc6, 0xcb, 0xe8, 0xe5, 0xf2, 0xff, 0xb4, 0xb9, 0xae, 0xa3, 0x80, 0x8d, 0x9a, 0x97, -}, -{ // gmult(0x0b, b); - 0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69, - 0xb0, 0xbb, 0xa6, 0xad, 0x9c, 0x97, 0x8a, 0x81, 0xe8, 0xe3, 0xfe, 0xf5, 0xc4, 0xcf, 0xd2, 0xd9, - 0x7b, 0x70, 0x6d, 0x66, 0x57, 0x5c, 0x41, 0x4a, 0x23, 0x28, 0x35, 0x3e, 0x0f, 0x04, 0x19, 0x12, - 0xcb, 0xc0, 0xdd, 0xd6, 0xe7, 0xec, 0xf1, 0xfa, 0x93, 0x98, 0x85, 0x8e, 0xbf, 0xb4, 0xa9, 0xa2, - 0xf6, 0xfd, 0xe0, 0xeb, 0xda, 0xd1, 0xcc, 0xc7, 0xae, 0xa5, 0xb8, 0xb3, 0x82, 0x89, 0x94, 0x9f, - 0x46, 0x4d, 0x50, 0x5b, 0x6a, 0x61, 0x7c, 0x77, 0x1e, 0x15, 0x08, 0x03, 0x32, 0x39, 0x24, 0x2f, - 0x8d, 0x86, 0x9b, 0x90, 0xa1, 0xaa, 0xb7, 0xbc, 0xd5, 0xde, 0xc3, 0xc8, 0xf9, 0xf2, 0xef, 0xe4, - 0x3d, 0x36, 0x2b, 0x20, 0x11, 0x1a, 0x07, 0x0c, 0x65, 0x6e, 0x73, 0x78, 0x49, 0x42, 0x5f, 0x54, - 0xf7, 0xfc, 0xe1, 0xea, 0xdb, 0xd0, 0xcd, 0xc6, 0xaf, 0xa4, 0xb9, 0xb2, 0x83, 0x88, 0x95, 0x9e, - 0x47, 0x4c, 0x51, 0x5a, 0x6b, 0x60, 0x7d, 0x76, 0x1f, 0x14, 0x09, 0x02, 0x33, 0x38, 0x25, 0x2e, - 0x8c, 0x87, 0x9a, 0x91, 0xa0, 0xab, 0xb6, 0xbd, 0xd4, 0xdf, 0xc2, 0xc9, 0xf8, 0xf3, 0xee, 0xe5, - 0x3c, 0x37, 0x2a, 0x21, 0x10, 0x1b, 0x06, 0x0d, 0x64, 0x6f, 0x72, 0x79, 0x48, 0x43, 0x5e, 0x55, - 0x01, 0x0a, 0x17, 0x1c, 0x2d, 0x26, 0x3b, 0x30, 0x59, 0x52, 0x4f, 0x44, 0x75, 0x7e, 0x63, 0x68, - 0xb1, 0xba, 0xa7, 0xac, 0x9d, 0x96, 0x8b, 0x80, 0xe9, 0xe2, 0xff, 0xf4, 0xc5, 0xce, 0xd3, 0xd8, - 0x7a, 0x71, 0x6c, 0x67, 0x56, 0x5d, 0x40, 0x4b, 0x22, 0x29, 0x34, 0x3f, 0x0e, 0x05, 0x18, 0x13, - 0xca, 0xc1, 0xdc, 0xd7, 0xe6, 0xed, 0xf0, 0xfb, 0x92, 0x99, 0x84, 0x8f, 0xbe, 0xb5, 0xa8, 0xa3, -} -}; - -/* - * S-box transformation table - */ -static uint8_t simde_x_aes_s_box[256] = 
{ - // 0 1 2 3 4 5 6 7 8 9 a b c d e f - 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, // 0 - 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, // 1 - 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, // 2 - 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, // 3 - 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, // 4 - 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, // 5 - 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, // 6 - 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, // 7 - 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, // 8 - 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, // 9 - 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, // a - 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, // b - 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, // c - 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, // d - 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, // e - 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16};// f - -/* - * Inverse S-box transformation table - */ -static uint8_t simde_x_aes_inv_s_box[256] = { - // 0 1 2 3 4 5 6 7 8 9 a b c d e f - 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb, // 0 - 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, // 1 - 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e, // 2 - 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25, // 3 - 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92, // 4 - 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84, // 5 - 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06, // 6 - 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, // 7 - 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73, // 8 - 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e, // 9 - 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b, // a - 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4, // b - 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f, // c - 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef, // d - 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61, // e - 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d};// f - -/* - * Multiplication of 4 byte words - * m(x) = x4+1 - -SIMDE_FUNCTION_ATTRIBUTES 
-void coef_mult(uint8_t *a, uint8_t *b, uint8_t *d) { - - d[0] = gmult(a[0],b[0])^gmult(a[3],b[1])^gmult(a[2],b[2])^gmult(a[1],b[3]); - d[1] = gmult(a[1],b[0])^gmult(a[0],b[1])^gmult(a[3],b[2])^gmult(a[2],b[3]); - d[2] = gmult(a[2],b[0])^gmult(a[1],b[1])^gmult(a[0],b[2])^gmult(a[3],b[3]); - d[3] = gmult(a[3],b[0])^gmult(a[2],b[1])^gmult(a[1],b[2])^gmult(a[0],b[3]); -} -*/ - -SIMDE_FUNCTION_ATTRIBUTES -void simde_x_aes_coef_mult_lookup(int lookup_table_offset, uint8_t *b, uint8_t *d) { - int o = lookup_table_offset; - - #define gmultl(o,b) simde_x_aes_gmult_lookup_table[o][b] - d[0] = gmultl(o+0,b[0])^gmultl(o+3,b[1])^gmultl(o+2,b[2])^gmultl(o+1,b[3]); - d[1] = gmultl(o+1,b[0])^gmultl(o+0,b[1])^gmultl(o+3,b[2])^gmultl(o+2,b[3]); - d[2] = gmultl(o+2,b[0])^gmultl(o+1,b[1])^gmultl(o+0,b[2])^gmultl(o+3,b[3]); - d[3] = gmultl(o+3,b[0])^gmultl(o+2,b[1])^gmultl(o+1,b[2])^gmultl(o+0,b[3]); - #undef gmultl -} - -#endif - -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_AES_H) */ diff --git a/extern/simde/simde-align.h b/extern/simde/simde-align.h deleted file mode 100644 index 0c8a809ee..000000000 --- a/extern/simde/simde-align.h +++ /dev/null @@ -1,450 +0,0 @@ -/* Alignment - * Created by Evan Nemerson - * - * To the extent possible under law, the authors have waived all - * copyright and related or neighboring rights to this code. For - * details, see the Creative Commons Zero 1.0 Universal license at - * - * - * SPDX-License-Identifier: CC0-1.0 - * - ********************************************************************** - * - * This is portability layer which should help iron out some - * differences across various compilers, as well as various verisons of - * C and C++. - * - * It was originally developed for SIMD Everywhere - * (), but since its only - * dependency is Hedley (, also CC0) - * it can easily be used in other projects, so please feel free to do - * so. - * - * If you do use this in your project, please keep a link to SIMDe in - * your code to remind you where to report any bugs and/or check for - * updated versions. - * - * # API Overview - * - * The API has several parts, and most macros have a few variations. - * There are APIs for declaring aligned fields/variables, optimization - * hints, and run-time alignment checks. - * - * Briefly, macros ending with "_TO" take numeric values and are great - * when you know the value you would like to use. Macros ending with - * "_LIKE", on the other hand, accept a type and are used when you want - * to use the alignment of a type instead of hardcoding a value. - * - * Documentation for each section of the API is inline. - * - * True to form, MSVC is the main problem and imposes several - * limitations on the effectiveness of the APIs. Detailed descriptions - * of the limitations of each macro are inline, but in general: - * - * * On C11+ or C++11+ code written using this API will work. The - * ASSUME macros may or may not generate a hint to the compiler, but - * that is only an optimization issue and will not actually cause - * failures. - * * If you're using pretty much any compiler other than MSVC, - * everything should basically work as well as in C11/C++11. - */ - -#if !defined(SIMDE_ALIGN_H) -#define SIMDE_ALIGN_H - -#include "hedley.h" - -/* I know this seems a little silly, but some non-hosted compilers - * don't have stddef.h, so we try to accomodate them. 
*/ -#if !defined(SIMDE_ALIGN_SIZE_T_) - #if defined(__SIZE_TYPE__) - #define SIMDE_ALIGN_SIZE_T_ __SIZE_TYPE__ - #elif defined(__SIZE_T_TYPE__) - #define SIMDE_ALIGN_SIZE_T_ __SIZE_TYPE__ - #elif defined(__cplusplus) - #include - #define SIMDE_ALIGN_SIZE_T_ size_t - #else - #include - #define SIMDE_ALIGN_SIZE_T_ size_t - #endif -#endif - -#if !defined(SIMDE_ALIGN_INTPTR_T_) - #if defined(__INTPTR_TYPE__) - #define SIMDE_ALIGN_INTPTR_T_ __INTPTR_TYPE__ - #elif defined(__PTRDIFF_TYPE__) - #define SIMDE_ALIGN_INTPTR_T_ __PTRDIFF_TYPE__ - #elif defined(__PTRDIFF_T_TYPE__) - #define SIMDE_ALIGN_INTPTR_T_ __PTRDIFF_T_TYPE__ - #elif defined(__cplusplus) - #include - #define SIMDE_ALIGN_INTPTR_T_ ptrdiff_t - #else - #include - #define SIMDE_ALIGN_INTPTR_T_ ptrdiff_t - #endif -#endif - -#if defined(SIMDE_ALIGN_DEBUG) - #if defined(__cplusplus) - #include - #else - #include - #endif -#endif - -/* SIMDE_ALIGN_OF(Type) - * - * The SIMDE_ALIGN_OF macro works like alignof, or _Alignof, or - * __alignof, or __alignof__, or __ALIGNOF__, depending on the compiler. - * It isn't defined everywhere (only when the compiler has some alignof- - * like feature we can use to implement it), but it should work in most - * modern compilers, as well as C11 and C++11. - * - * If we can't find an implementation for SIMDE_ALIGN_OF then the macro - * will not be defined, so if you can handle that situation sensibly - * you may need to sprinkle some ifdefs into your code. - */ -#if \ - (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || \ - (0 && HEDLEY_HAS_FEATURE(c_alignof)) - #define SIMDE_ALIGN_OF(Type) _Alignof(Type) -#elif \ - (defined(__cplusplus) && (__cplusplus >= 201103L)) || \ - (0 && HEDLEY_HAS_FEATURE(cxx_alignof)) - #define SIMDE_ALIGN_OF(Type) alignof(Type) -#elif \ - HEDLEY_GCC_VERSION_CHECK(2,95,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) || \ - HEDLEY_TINYC_VERSION_CHECK(0,9,24) || \ - HEDLEY_PGI_VERSION_CHECK(19,10,0) || \ - HEDLEY_CRAY_VERSION_CHECK(10,0,0) || \ - HEDLEY_TI_ARMCL_VERSION_CHECK(16,9,0) || \ - HEDLEY_TI_CL2000_VERSION_CHECK(16,9,0) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CL430_VERSION_CHECK(16,9,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,2) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \ - defined(__IBM__ALIGNOF__) || \ - defined(__clang__) - #define SIMDE_ALIGN_OF(Type) __alignof__(Type) -#elif \ - HEDLEY_IAR_VERSION_CHECK(8,40,0) - #define SIMDE_ALIGN_OF(Type) __ALIGNOF__(Type) -#elif \ - HEDLEY_MSVC_VERSION_CHECK(19,0,0) - /* Probably goes back much further, but MS takes down their old docs. - * If you can verify that this works in earlier versions please let - * me know! */ - #define SIMDE_ALIGN_OF(Type) __alignof(Type) -#endif - -/* SIMDE_ALIGN_MAXIMUM: - * - * This is the maximum alignment that the compiler supports. You can - * define the value prior to including SIMDe if necessary, but in that - * case *please* submit an issue so we can add the platform to the - * detection code. - * - * Most compilers are okay with types which are aligned beyond what - * they think is the maximum, as long as the alignment is a power - * of two. Older versions of MSVC is the exception, so we need to cap - * the alignment requests at values that the implementation supports. 
- * - * XL C/C++ will accept values larger than 16 (which is the alignment - * of an AltiVec vector), but will not reliably align to the larger - * value, so so we cap the value at 16 there. - * - * If the compiler accepts any power-of-two value within reason then - * this macro should be left undefined, and the SIMDE_ALIGN_CAP - * macro will just return the value passed to it. */ -#if !defined(SIMDE_ALIGN_MAXIMUM) - #if defined(HEDLEY_MSVC_VERSION) - #if HEDLEY_MSVC_VERSION_CHECK(19, 16, 0) - // Visual studio 2017 and newer does not need a max - #else - #if defined(_M_IX86) || defined(_M_AMD64) - #if HEDLEY_MSVC_VERSION_CHECK(19,14,0) - #define SIMDE_ALIGN_PLATFORM_MAXIMUM 64 - #elif HEDLEY_MSVC_VERSION_CHECK(16,0,0) - /* VS 2010 is really a guess based on Wikipedia; if anyone can - * test with old VS versions I'd really appreciate it. */ - #define SIMDE_ALIGN_PLATFORM_MAXIMUM 32 - #else - #define SIMDE_ALIGN_PLATFORM_MAXIMUM 16 - #endif - #elif defined(_M_ARM) || defined(_M_ARM64) - #define SIMDE_ALIGN_PLATFORM_MAXIMUM 8 - #endif - #endif - #elif defined(HEDLEY_IBM_VERSION) - #define SIMDE_ALIGN_PLATFORM_MAXIMUM 16 - #endif -#endif - -/* You can mostly ignore these; they're intended for internal use. - * If you do need to use them please let me know; if they fulfill - * a common use case I'll probably drop the trailing underscore - * and make them part of the public API. */ -#if defined(SIMDE_ALIGN_PLATFORM_MAXIMUM) - #if SIMDE_ALIGN_PLATFORM_MAXIMUM >= 64 - #define SIMDE_ALIGN_64_ 64 - #define SIMDE_ALIGN_32_ 32 - #define SIMDE_ALIGN_16_ 16 - #define SIMDE_ALIGN_8_ 8 - #elif SIMDE_ALIGN_PLATFORM_MAXIMUM >= 32 - #define SIMDE_ALIGN_64_ 32 - #define SIMDE_ALIGN_32_ 32 - #define SIMDE_ALIGN_16_ 16 - #define SIMDE_ALIGN_8_ 8 - #elif SIMDE_ALIGN_PLATFORM_MAXIMUM >= 16 - #define SIMDE_ALIGN_64_ 16 - #define SIMDE_ALIGN_32_ 16 - #define SIMDE_ALIGN_16_ 16 - #define SIMDE_ALIGN_8_ 8 - #elif SIMDE_ALIGN_PLATFORM_MAXIMUM >= 8 - #define SIMDE_ALIGN_64_ 8 - #define SIMDE_ALIGN_32_ 8 - #define SIMDE_ALIGN_16_ 8 - #define SIMDE_ALIGN_8_ 8 - #else - #error Max alignment expected to be >= 8 - #endif -#else - #define SIMDE_ALIGN_64_ 64 - #define SIMDE_ALIGN_32_ 32 - #define SIMDE_ALIGN_16_ 16 - #define SIMDE_ALIGN_8_ 8 -#endif - -/** - * SIMDE_ALIGN_CAP(Alignment) - * - * Returns the minimum of Alignment or SIMDE_ALIGN_MAXIMUM. - */ -#if defined(SIMDE_ALIGN_MAXIMUM) - #define SIMDE_ALIGN_CAP(Alignment) (((Alignment) < (SIMDE_ALIGN_PLATFORM_MAXIMUM)) ? (Alignment) : (SIMDE_ALIGN_PLATFORM_MAXIMUM)) -#else - #define SIMDE_ALIGN_CAP(Alignment) (Alignment) -#endif - -/* SIMDE_ALIGN_TO(Alignment) - * - * SIMDE_ALIGN_TO is used to declare types or variables. It basically - * maps to the align attribute in most compilers, the align declspec - * in MSVC, or _Alignas/alignas in C11/C++11. - * - * Example: - * - * struct i32x4 { - * SIMDE_ALIGN_TO(16) int32_t values[4]; - * } - * - * Limitations: - * - * MSVC requires that the Alignment parameter be numeric; you can't do - * something like `SIMDE_ALIGN_TO(SIMDE_ALIGN_OF(int))`. This is - * unfortunate because that's really how the LIKE macros are - * implemented, and I am not aware of a way to get anything like this - * to work without using the C11/C++11 keywords. - * - * It also means that we can't use SIMDE_ALIGN_CAP to limit the - * alignment to the value specified, which MSVC also requires, so on - * MSVC you should use the `SIMDE_ALIGN_TO_8/16/32/64` macros instead. 
- * They work like `SIMDE_ALIGN_TO(SIMDE_ALIGN_CAP(Alignment))` would, - * but should be safe to use on MSVC. - * - * All this is to say that, if you want your code to work on MSVC, you - * should use the SIMDE_ALIGN_TO_8/16/32/64 macros below instead of - * SIMDE_ALIGN_TO(8/16/32/64). - */ -#if \ - HEDLEY_HAS_ATTRIBUTE(aligned) || \ - HEDLEY_GCC_VERSION_CHECK(2,95,0) || \ - HEDLEY_CRAY_VERSION_CHECK(8,4,0) || \ - HEDLEY_IBM_VERSION_CHECK(11,1,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_PGI_VERSION_CHECK(19,4,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_TINYC_VERSION_CHECK(0,9,24) || \ - HEDLEY_TI_ARMCL_VERSION_CHECK(16,9,0) || \ - HEDLEY_TI_CL2000_VERSION_CHECK(16,9,0) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CL430_VERSION_CHECK(16,9,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,2) - #define SIMDE_ALIGN_TO(Alignment) __attribute__((__aligned__(SIMDE_ALIGN_CAP(Alignment)))) -#elif \ - (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) - #define SIMDE_ALIGN_TO(Alignment) _Alignas(SIMDE_ALIGN_CAP(Alignment)) -#elif \ - (defined(__cplusplus) && (__cplusplus >= 201103L)) - #define SIMDE_ALIGN_TO(Alignment) alignas(SIMDE_ALIGN_CAP(Alignment)) -#elif \ - defined(HEDLEY_MSVC_VERSION) - #define SIMDE_ALIGN_TO(Alignment) __declspec(align(Alignment)) - /* Unfortunately MSVC can't handle __declspec(align(__alignof(Type))); - * the alignment passed to the declspec has to be an integer. */ - #define SIMDE_ALIGN_OF_UNUSABLE_FOR_LIKE -#endif -#define SIMDE_ALIGN_TO_64 SIMDE_ALIGN_TO(SIMDE_ALIGN_64_) -#define SIMDE_ALIGN_TO_32 SIMDE_ALIGN_TO(SIMDE_ALIGN_32_) -#define SIMDE_ALIGN_TO_16 SIMDE_ALIGN_TO(SIMDE_ALIGN_16_) -#define SIMDE_ALIGN_TO_8 SIMDE_ALIGN_TO(SIMDE_ALIGN_8_) - -/* SIMDE_ALIGN_ASSUME_TO(Pointer, Alignment) - * - * SIMDE_ALIGN_ASSUME_TO is semantically similar to C++20's - * std::assume_aligned, or __builtin_assume_aligned. It tells the - * compiler to assume that the provided pointer is aligned to an - * `Alignment`-byte boundary. - * - * If you define SIMDE_ALIGN_DEBUG prior to including this header then - * SIMDE_ALIGN_ASSUME_TO will turn into a runtime check. 
We don't - * integrate with NDEBUG in this header, but it may be a good idea to - * put something like this in your code: - * - * #if !defined(NDEBUG) - * #define SIMDE_ALIGN_DEBUG - * #endif - * #include <.../simde-align.h> - */ -#if \ - HEDLEY_HAS_BUILTIN(__builtin_assume_aligned) || \ - HEDLEY_GCC_VERSION_CHECK(4,7,0) - #define SIMDE_ALIGN_ASSUME_TO_UNCHECKED(Pointer, Alignment) \ - HEDLEY_REINTERPRET_CAST(__typeof__(Pointer), __builtin_assume_aligned(HEDLEY_CONST_CAST(void*, HEDLEY_REINTERPRET_CAST(const void*, Pointer)), Alignment)) -#elif HEDLEY_INTEL_VERSION_CHECK(13,0,0) - #define SIMDE_ALIGN_ASSUME_TO_UNCHECKED(Pointer, Alignment) (__extension__ ({ \ - __typeof__(v) simde_assume_aligned_t_ = (Pointer); \ - __assume_aligned(simde_assume_aligned_t_, Alignment); \ - simde_assume_aligned_t_; \ - })) -#elif defined(__cplusplus) && (__cplusplus > 201703L) - #include - #define SIMDE_ALIGN_ASSUME_TO_UNCHECKED(Pointer, Alignment) std::assume_aligned(Pointer) -#else - #if defined(__cplusplus) - template HEDLEY_ALWAYS_INLINE static T* simde_align_assume_to_unchecked(T* ptr, const size_t alignment) - #else - HEDLEY_ALWAYS_INLINE static void* simde_align_assume_to_unchecked(void* ptr, const size_t alignment) - #endif - { - HEDLEY_ASSUME((HEDLEY_REINTERPRET_CAST(size_t, (ptr)) % SIMDE_ALIGN_CAP(alignment)) == 0); - return ptr; - } - #if defined(__cplusplus) - #define SIMDE_ALIGN_ASSUME_TO_UNCHECKED(Pointer, Alignment) simde_align_assume_to_unchecked((Pointer), (Alignment)) - #else - #define SIMDE_ALIGN_ASSUME_TO_UNCHECKED(Pointer, Alignment) simde_align_assume_to_unchecked(HEDLEY_CONST_CAST(void*, HEDLEY_REINTERPRET_CAST(const void*, Pointer)), (Alignment)) - #endif -#endif - -#if !defined(SIMDE_ALIGN_DEBUG) - #define SIMDE_ALIGN_ASSUME_TO(Pointer, Alignment) SIMDE_ALIGN_ASSUME_TO_UNCHECKED(Pointer, Alignment) -#else - #include - #if defined(__cplusplus) - template - static HEDLEY_ALWAYS_INLINE - T* - simde_align_assume_to_checked_uncapped(T* ptr, const size_t alignment, const char* file, int line, const char* ptrname) - #else - static HEDLEY_ALWAYS_INLINE - void* - simde_align_assume_to_checked_uncapped(void* ptr, const size_t alignment, const char* file, int line, const char* ptrname) - #endif - { - if (HEDLEY_UNLIKELY((HEDLEY_REINTERPRET_CAST(SIMDE_ALIGN_INTPTR_T_, (ptr)) % HEDLEY_STATIC_CAST(SIMDE_ALIGN_INTPTR_T_, SIMDE_ALIGN_CAP(alignment))) != 0)) { - fprintf(stderr, "%s:%d: alignment check failed for `%s' (%p %% %u == %u)\n", - file, line, ptrname, HEDLEY_REINTERPRET_CAST(const void*, ptr), - HEDLEY_STATIC_CAST(unsigned int, SIMDE_ALIGN_CAP(alignment)), - HEDLEY_STATIC_CAST(unsigned int, HEDLEY_REINTERPRET_CAST(SIMDE_ALIGN_INTPTR_T_, (ptr)) % HEDLEY_STATIC_CAST(SIMDE_ALIGN_INTPTR_T_, SIMDE_ALIGN_CAP(alignment)))); - } - - return ptr; - } - - #if defined(__cplusplus) - #define SIMDE_ALIGN_ASSUME_TO(Pointer, Alignment) simde_align_assume_to_checked_uncapped((Pointer), (Alignment), __FILE__, __LINE__, #Pointer) - #else - #define SIMDE_ALIGN_ASSUME_TO(Pointer, Alignment) simde_align_assume_to_checked_uncapped(HEDLEY_CONST_CAST(void*, HEDLEY_REINTERPRET_CAST(const void*, Pointer)), (Alignment), __FILE__, __LINE__, #Pointer) - #endif -#endif - -/* SIMDE_ALIGN_LIKE(Type) - * SIMDE_ALIGN_LIKE_#(Type) - * - * The SIMDE_ALIGN_LIKE macros are similar to the SIMDE_ALIGN_TO macros - * except instead of an integer they take a type; basically, it's just - * a more convenient way to do something like: - * - * SIMDE_ALIGN_TO(SIMDE_ALIGN_OF(Type)) - * - * The versions with a numeric suffix will fall back 
on using a numeric - * value in the event we can't use SIMDE_ALIGN_OF(Type). This is - * mainly for MSVC, where __declspec(align()) can't handle anything - * other than hard-coded numeric values. - */ -#if defined(SIMDE_ALIGN_OF) && defined(SIMDE_ALIGN_TO) && !defined(SIMDE_ALIGN_OF_UNUSABLE_FOR_LIKE) - #define SIMDE_ALIGN_LIKE(Type) SIMDE_ALIGN_TO(SIMDE_ALIGN_OF(Type)) - #define SIMDE_ALIGN_LIKE_64(Type) SIMDE_ALIGN_LIKE(Type) - #define SIMDE_ALIGN_LIKE_32(Type) SIMDE_ALIGN_LIKE(Type) - #define SIMDE_ALIGN_LIKE_16(Type) SIMDE_ALIGN_LIKE(Type) - #define SIMDE_ALIGN_LIKE_8(Type) SIMDE_ALIGN_LIKE(Type) -#else - #define SIMDE_ALIGN_LIKE_64(Type) SIMDE_ALIGN_TO_64 - #define SIMDE_ALIGN_LIKE_32(Type) SIMDE_ALIGN_TO_32 - #define SIMDE_ALIGN_LIKE_16(Type) SIMDE_ALIGN_TO_16 - #define SIMDE_ALIGN_LIKE_8(Type) SIMDE_ALIGN_TO_8 -#endif - -/* SIMDE_ALIGN_ASSUME_LIKE(Pointer, Type) - * - * Tihs is similar to SIMDE_ALIGN_ASSUME_TO, except that it takes a - * type instead of a numeric value. */ -#if defined(SIMDE_ALIGN_OF) && defined(SIMDE_ALIGN_ASSUME_TO) - #define SIMDE_ALIGN_ASSUME_LIKE(Pointer, Type) SIMDE_ALIGN_ASSUME_TO(Pointer, SIMDE_ALIGN_OF(Type)) -#endif - -/* SIMDE_ALIGN_CAST(Type, Pointer) - * - * SIMDE_ALIGN_CAST is like C++'s reinterpret_cast, but it will try - * to silence warnings that some compilers may produce if you try - * to assign to a type with increased alignment requirements. - * - * Note that it does *not* actually attempt to tell the compiler that - * the pointer is aligned like the destination should be; that's the - * job of the next macro. This macro is necessary for stupid APIs - * like _mm_loadu_si128 where the input is a __m128i* but the function - * is specifically for data which isn't necessarily aligned to - * _Alignof(__m128i). - */ -#if HEDLEY_HAS_WARNING("-Wcast-align") || defined(__clang__) || HEDLEY_GCC_VERSION_CHECK(3,4,0) - #define SIMDE_ALIGN_CAST(Type, Pointer) (__extension__({ \ - HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("GCC diagnostic ignored \"-Wcast-align\"") \ - Type simde_r_ = HEDLEY_REINTERPRET_CAST(Type, Pointer); \ - HEDLEY_DIAGNOSTIC_POP \ - simde_r_; \ - })) -#else - #define SIMDE_ALIGN_CAST(Type, Pointer) HEDLEY_REINTERPRET_CAST(Type, Pointer) -#endif - -/* SIMDE_ALIGN_ASSUME_CAST(Type, Pointer) - * - * This is sort of like a combination of a reinterpret_cast and a - * SIMDE_ALIGN_ASSUME_LIKE. It uses SIMDE_ALIGN_ASSUME_LIKE to tell - * the compiler that the pointer is aligned like the specified type - * and casts the pointer to the specified type while suppressing any - * warnings from the compiler about casting to a type with greater - * alignment requirements. - */ -#define SIMDE_ALIGN_ASSUME_CAST(Type, Pointer) SIMDE_ALIGN_ASSUME_LIKE(SIMDE_ALIGN_CAST(Type, Pointer), Type) - -#endif /* !defined(SIMDE_ALIGN_H) */ diff --git a/extern/simde/simde-arch.h b/extern/simde/simde-arch.h deleted file mode 100644 index a492d7edc..000000000 --- a/extern/simde/simde-arch.h +++ /dev/null @@ -1,622 +0,0 @@ -/* Architecture detection - * Created by Evan Nemerson - * - * To the extent possible under law, the authors have waived all - * copyright and related or neighboring rights to this code. For - * details, see the Creative Commons Zero 1.0 Universal license at - * - * - * SPDX-License-Identifier: CC0-1.0 - * - * Different compilers define different preprocessor macros for the - * same architecture. This is an attempt to provide a single - * interface which is usable on any compiler. 
- * - * In general, a macro named SIMDE_ARCH_* is defined for each - * architecture the CPU supports. When there are multiple possible - * versions, we try to define the macro to the target version. For - * example, if you want to check for i586+, you could do something - * like: - * - * #if defined(SIMDE_ARCH_X86) && (SIMDE_ARCH_X86 >= 5) - * ... - * #endif - * - * You could also just check that SIMDE_ARCH_X86 >= 5 without checking - * if it's defined first, but some compilers may emit a warning about - * an undefined macro being used (e.g., GCC with -Wundef). - * - * This was originally created for SIMDe - * (hence the prefix), but this - * header has no dependencies and may be used anywhere. It is - * originally based on information from - * , though it - * has been enhanced with additional information. - * - * If you improve this file, or find a bug, please file the issue at - * . If you copy this into - * your project, even if you change the prefix, please keep the links - * to SIMDe intact so others know where to report issues, submit - * enhancements, and find the latest version. */ - -#if !defined(SIMDE_ARCH_H) -#define SIMDE_ARCH_H - -#include "hedley.h" - -/* Alpha - */ -#if defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA) -# if defined(__alpha_ev6__) -# define SIMDE_ARCH_ALPHA 6 -# elif defined(__alpha_ev5__) -# define SIMDE_ARCH_ALPHA 5 -# elif defined(__alpha_ev4__) -# define SIMDE_ARCH_ALPHA 4 -# else -# define SIMDE_ARCH_ALPHA 1 -# endif -#endif -#if defined(SIMDE_ARCH_ALPHA) -# define SIMDE_ARCH_ALPHA_CHECK(version) ((version) <= SIMDE_ARCH_ALPHA) -#else -# define SIMDE_ARCH_ALPHA_CHECK(version) (0) -#endif - -/* Atmel AVR - */ -#if defined(__AVR_ARCH__) -# define SIMDE_ARCH_AVR __AVR_ARCH__ -#endif - -/* AMD64 / x86_64 - */ -#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64) -# if !defined(_M_ARM64EC) -# define SIMDE_ARCH_AMD64 1000 -# endif -#endif - -/* ARM - */ -#if defined(__ARM_ARCH) -# if __ARM_ARCH > 100 -# define SIMDE_ARCH_ARM (__ARM_ARCH) -# else -# define SIMDE_ARCH_ARM (__ARM_ARCH * 100) -# endif -#elif defined(_M_ARM) -# if _M_ARM > 100 -# define SIMDE_ARCH_ARM (_M_ARM) -# else -# define SIMDE_ARCH_ARM (_M_ARM * 100) -# endif -#elif defined(_M_ARM64) || defined(_M_ARM64EC) -# define SIMDE_ARCH_ARM 800 -#elif defined(__arm__) || defined(__thumb__) || defined(__TARGET_ARCH_ARM) || defined(_ARM) || defined(_M_ARM) || defined(_M_ARM) -# define SIMDE_ARCH_ARM 1 -#endif -#if defined(SIMDE_ARCH_ARM) -# define SIMDE_ARCH_ARM_CHECK(major, minor) (((major * 100) + (minor)) <= SIMDE_ARCH_ARM) -#else -# define SIMDE_ARCH_ARM_CHECK(major, minor) (0) -#endif - -/* AArch64 - */ -#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) -# define SIMDE_ARCH_AARCH64 1000 -#endif -#if defined(SIMDE_ARCH_AARCH64) -# define SIMDE_ARCH_AARCH64_CHECK(version) ((version) <= SIMDE_ARCH_AARCH64) -#else -# define SIMDE_ARCH_AARCH64_CHECK(version) (0) -#endif - -/* ARM SIMD ISA extensions */ -#if defined(__ARM_NEON) || defined(SIMDE_ARCH_AARCH64) -# if defined(SIMDE_ARCH_AARCH64) -# define SIMDE_ARCH_ARM_NEON SIMDE_ARCH_AARCH64 -# elif defined(SIMDE_ARCH_ARM) -# define SIMDE_ARCH_ARM_NEON SIMDE_ARCH_ARM -# endif -#endif -#if defined(__ARM_FEATURE_SVE) -# define SIMDE_ARCH_ARM_SVE -#endif -#if defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA -# define SIMDE_ARCH_ARM_FMA -#endif -#if defined(__ARM_FEATURE_CRYPTO) -# define SIMDE_ARCH_ARM_CRYPTO -#endif -#if defined(__ARM_FEATURE_QRDMX) -# 
define SIMDE_ARCH_ARM_QRDMX -#endif - -/* Blackfin - */ -#if defined(__bfin) || defined(__BFIN__) || defined(__bfin__) -# define SIMDE_ARCH_BLACKFIN 1 -#endif - -/* CRIS - */ -#if defined(__CRIS_arch_version) -# define SIMDE_ARCH_CRIS __CRIS_arch_version -#elif defined(__cris__) || defined(__cris) || defined(__CRIS) || defined(__CRIS__) -# define SIMDE_ARCH_CRIS 1 -#endif - -/* Convex - */ -#if defined(__convex_c38__) -# define SIMDE_ARCH_CONVEX 38 -#elif defined(__convex_c34__) -# define SIMDE_ARCH_CONVEX 34 -#elif defined(__convex_c32__) -# define SIMDE_ARCH_CONVEX 32 -#elif defined(__convex_c2__) -# define SIMDE_ARCH_CONVEX 2 -#elif defined(__convex__) -# define SIMDE_ARCH_CONVEX 1 -#endif -#if defined(SIMDE_ARCH_CONVEX) -# define SIMDE_ARCH_CONVEX_CHECK(version) ((version) <= SIMDE_ARCH_CONVEX) -#else -# define SIMDE_ARCH_CONVEX_CHECK(version) (0) -#endif - -/* Adapteva Epiphany - */ -#if defined(__epiphany__) -# define SIMDE_ARCH_EPIPHANY 1 -#endif - -/* Fujitsu FR-V - */ -#if defined(__frv__) -# define SIMDE_ARCH_FRV 1 -#endif - -/* H8/300 - */ -#if defined(__H8300__) -# define SIMDE_ARCH_H8300 -#endif - -/* Elbrus (8S, 8SV and successors) - */ -#if defined(__e2k__) -# define SIMDE_ARCH_E2K -#endif - -/* HP/PA / PA-RISC - */ -#if defined(__PA8000__) || defined(__HPPA20__) || defined(__RISC2_0__) || defined(_PA_RISC2_0) -# define SIMDE_ARCH_HPPA 20 -#elif defined(__PA7100__) || defined(__HPPA11__) || defined(_PA_RISC1_1) -# define SIMDE_ARCH_HPPA 11 -#elif defined(_PA_RISC1_0) -# define SIMDE_ARCH_HPPA 10 -#elif defined(__hppa__) || defined(__HPPA__) || defined(__hppa) -# define SIMDE_ARCH_HPPA 1 -#endif -#if defined(SIMDE_ARCH_HPPA) -# define SIMDE_ARCH_HPPA_CHECK(version) ((version) <= SIMDE_ARCH_HPPA) -#else -# define SIMDE_ARCH_HPPA_CHECK(version) (0) -#endif - -/* x86 - */ -#if defined(_M_IX86) -# define SIMDE_ARCH_X86 (_M_IX86 / 100) -#elif defined(__I86__) -# define SIMDE_ARCH_X86 __I86__ -#elif defined(i686) || defined(__i686) || defined(__i686__) -# define SIMDE_ARCH_X86 6 -#elif defined(i586) || defined(__i586) || defined(__i586__) -# define SIMDE_ARCH_X86 5 -#elif defined(i486) || defined(__i486) || defined(__i486__) -# define SIMDE_ARCH_X86 4 -#elif defined(i386) || defined(__i386) || defined(__i386__) -# define SIMDE_ARCH_X86 3 -#elif defined(_X86_) || defined(__X86__) || defined(__THW_INTEL__) -# define SIMDE_ARCH_X86 3 -#endif -#if defined(SIMDE_ARCH_X86) -# define SIMDE_ARCH_X86_CHECK(version) ((version) <= SIMDE_ARCH_X86) -#else -# define SIMDE_ARCH_X86_CHECK(version) (0) -#endif - -/* SIMD ISA extensions for x86/x86_64 and Elbrus */ -#if defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64) || defined(SIMDE_ARCH_E2K) -# if defined(_M_IX86_FP) -# define SIMDE_ARCH_X86_MMX -# if (_M_IX86_FP >= 1) -# define SIMDE_ARCH_X86_SSE 1 -# endif -# if (_M_IX86_FP >= 2) -# define SIMDE_ARCH_X86_SSE2 1 -# endif -# elif defined(_M_X64) -# define SIMDE_ARCH_X86_SSE 1 -# define SIMDE_ARCH_X86_SSE2 1 -# else -# if defined(__MMX__) -# define SIMDE_ARCH_X86_MMX 1 -# endif -# if defined(__SSE__) -# define SIMDE_ARCH_X86_SSE 1 -# endif -# if defined(__SSE2__) -# define SIMDE_ARCH_X86_SSE2 1 -# endif -# endif -# if defined(__SSE3__) -# define SIMDE_ARCH_X86_SSE3 1 -# endif -# if defined(__SSSE3__) -# define SIMDE_ARCH_X86_SSSE3 1 -# endif -# if defined(__SSE4_1__) -# define SIMDE_ARCH_X86_SSE4_1 1 -# endif -# if defined(__SSE4_2__) -# define SIMDE_ARCH_X86_SSE4_2 1 -# endif -# if defined(__XOP__) -# define SIMDE_ARCH_X86_XOP 1 -# endif -# if defined(__AVX__) -# define SIMDE_ARCH_X86_AVX 1 
-# if !defined(SIMDE_ARCH_X86_SSE3) -# define SIMDE_ARCH_X86_SSE3 1 -# endif -# if !defined(SIMDE_ARCH_X86_SSE4_1) -# define SIMDE_ARCH_X86_SSE4_1 1 -# endif -# if !defined(SIMDE_ARCH_X86_SSE4_2) -# define SIMDE_ARCH_X86_SSE4_2 1 -# endif -# endif -# if defined(__AVX2__) -# define SIMDE_ARCH_X86_AVX2 1 -# if defined(_MSC_VER) -# define SIMDE_ARCH_X86_FMA 1 -# endif -# endif -# if defined(__FMA__) -# define SIMDE_ARCH_X86_FMA 1 -# if !defined(SIMDE_ARCH_X86_AVX) -# define SIMDE_ARCH_X86_AVX 1 -# endif -# endif -# if defined(__AVX512VP2INTERSECT__) -# define SIMDE_ARCH_X86_AVX512VP2INTERSECT 1 -# endif -# if defined(__AVX512BITALG__) -# define SIMDE_ARCH_X86_AVX512BITALG 1 -# endif -# if defined(__AVX512VPOPCNTDQ__) -# define SIMDE_ARCH_X86_AVX512VPOPCNTDQ 1 -# endif -# if defined(__AVX512VBMI__) -# define SIMDE_ARCH_X86_AVX512VBMI 1 -# endif -# if defined(__AVX512VBMI2__) -# define SIMDE_ARCH_X86_AVX512VBMI2 1 -# endif -# if defined(__AVX512VNNI__) -# define SIMDE_ARCH_X86_AVX512VNNI 1 -# endif -# if defined(__AVX5124VNNIW__) -# define SIMDE_ARCH_X86_AVX5124VNNIW 1 -# endif -# if defined(__AVX512BW__) -# define SIMDE_ARCH_X86_AVX512BW 1 -# endif -# if defined(__AVX512BF16__) -# define SIMDE_ARCH_X86_AVX512BF16 1 -# endif -# if defined(__AVX512CD__) -# define SIMDE_ARCH_X86_AVX512CD 1 -# endif -# if defined(__AVX512DQ__) -# define SIMDE_ARCH_X86_AVX512DQ 1 -# endif -# if defined(__AVX512F__) -# define SIMDE_ARCH_X86_AVX512F 1 -# endif -# if defined(__AVX512VL__) -# define SIMDE_ARCH_X86_AVX512VL 1 -# endif -# if defined(__AVX512FP16__) -# define SIMDE_ARCH_X86_AVX512FP16 1 -# endif -# if defined(__GFNI__) -# define SIMDE_ARCH_X86_GFNI 1 -# endif -# if defined(__PCLMUL__) -# define SIMDE_ARCH_X86_PCLMUL 1 -# endif -# if defined(__VPCLMULQDQ__) -# define SIMDE_ARCH_X86_VPCLMULQDQ 1 -# endif -# if defined(__F16C__) || (defined(HEDLEY_MSVC_VERSION) && HEDLEY_MSVC_VERSION_CHECK(19,30,0) && defined(SIMDE_ARCH_X86_AVX2) ) -# define SIMDE_ARCH_X86_F16C 1 -# endif -# if defined(__AES__) -# define SIMDE_ARCH_X86_AES 1 -# endif -#endif - -/* Itanium - */ -#if defined(__ia64__) || defined(_IA64) || defined(__IA64__) || defined(__ia64) || defined(_M_IA64) || defined(__itanium__) -# define SIMDE_ARCH_IA64 1 -#endif - -/* Renesas M32R - */ -#if defined(__m32r__) || defined(__M32R__) -# define SIMDE_ARCH_M32R -#endif - -/* Motorola 68000 - */ -#if defined(__mc68060__) || defined(__MC68060__) -# define SIMDE_ARCH_M68K 68060 -#elif defined(__mc68040__) || defined(__MC68040__) -# define SIMDE_ARCH_M68K 68040 -#elif defined(__mc68030__) || defined(__MC68030__) -# define SIMDE_ARCH_M68K 68030 -#elif defined(__mc68020__) || defined(__MC68020__) -# define SIMDE_ARCH_M68K 68020 -#elif defined(__mc68010__) || defined(__MC68010__) -# define SIMDE_ARCH_M68K 68010 -#elif defined(__mc68000__) || defined(__MC68000__) -# define SIMDE_ARCH_M68K 68000 -#endif -#if defined(SIMDE_ARCH_M68K) -# define SIMDE_ARCH_M68K_CHECK(version) ((version) <= SIMDE_ARCH_M68K) -#else -# define SIMDE_ARCH_M68K_CHECK(version) (0) -#endif - -/* Xilinx MicroBlaze - */ -#if defined(__MICROBLAZE__) || defined(__microblaze__) -# define SIMDE_ARCH_MICROBLAZE -#endif - -/* MIPS - */ -#if defined(_MIPS_ISA_MIPS64R2) -# define SIMDE_ARCH_MIPS 642 -#elif defined(_MIPS_ISA_MIPS64) -# define SIMDE_ARCH_MIPS 640 -#elif defined(_MIPS_ISA_MIPS32R2) -# define SIMDE_ARCH_MIPS 322 -#elif defined(_MIPS_ISA_MIPS32) -# define SIMDE_ARCH_MIPS 320 -#elif defined(_MIPS_ISA_MIPS4) -# define SIMDE_ARCH_MIPS 4 -#elif defined(_MIPS_ISA_MIPS3) -# define SIMDE_ARCH_MIPS 3 
-#elif defined(_MIPS_ISA_MIPS2) -# define SIMDE_ARCH_MIPS 2 -#elif defined(_MIPS_ISA_MIPS1) -# define SIMDE_ARCH_MIPS 1 -#elif defined(_MIPS_ISA_MIPS) || defined(__mips) || defined(__MIPS__) -# define SIMDE_ARCH_MIPS 1 -#endif -#if defined(SIMDE_ARCH_MIPS) -# define SIMDE_ARCH_MIPS_CHECK(version) ((version) <= SIMDE_ARCH_MIPS) -#else -# define SIMDE_ARCH_MIPS_CHECK(version) (0) -#endif - -#if defined(__mips_loongson_mmi) -# define SIMDE_ARCH_MIPS_LOONGSON_MMI 1 -#endif - -#if defined(__mips_msa) -# define SIMDE_ARCH_MIPS_MSA 1 -#endif - -/* Matsushita MN10300 - */ -#if defined(__MN10300__) || defined(__mn10300__) -# define SIMDE_ARCH_MN10300 1 -#endif - -/* POWER - */ -#if defined(_M_PPC) -# define SIMDE_ARCH_POWER _M_PPC -#elif defined(_ARCH_PWR9) -# define SIMDE_ARCH_POWER 900 -#elif defined(_ARCH_PWR8) -# define SIMDE_ARCH_POWER 800 -#elif defined(_ARCH_PWR7) -# define SIMDE_ARCH_POWER 700 -#elif defined(_ARCH_PWR6) -# define SIMDE_ARCH_POWER 600 -#elif defined(_ARCH_PWR5) -# define SIMDE_ARCH_POWER 500 -#elif defined(_ARCH_PWR4) -# define SIMDE_ARCH_POWER 400 -#elif defined(_ARCH_440) || defined(__ppc440__) -# define SIMDE_ARCH_POWER 440 -#elif defined(_ARCH_450) || defined(__ppc450__) -# define SIMDE_ARCH_POWER 450 -#elif defined(_ARCH_601) || defined(__ppc601__) -# define SIMDE_ARCH_POWER 601 -#elif defined(_ARCH_603) || defined(__ppc603__) -# define SIMDE_ARCH_POWER 603 -#elif defined(_ARCH_604) || defined(__ppc604__) -# define SIMDE_ARCH_POWER 604 -#elif defined(_ARCH_605) || defined(__ppc605__) -# define SIMDE_ARCH_POWER 605 -#elif defined(_ARCH_620) || defined(__ppc620__) -# define SIMDE_ARCH_POWER 620 -#elif defined(__powerpc) || defined(__powerpc__) || defined(__POWERPC__) || defined(__ppc__) || defined(__PPC__) || defined(_ARCH_PPC) || defined(__ppc) -# define SIMDE_ARCH_POWER 1 -#endif -#if defined(SIMDE_ARCH_POWER) - #define SIMDE_ARCH_POWER_CHECK(version) ((version) <= SIMDE_ARCH_POWER) -#else - #define SIMDE_ARCH_POWER_CHECK(version) (0) -#endif - -#if defined(__ALTIVEC__) -# define SIMDE_ARCH_POWER_ALTIVEC SIMDE_ARCH_POWER - #define SIMDE_ARCH_POWER_ALTIVEC_CHECK(version) ((version) <= SIMDE_ARCH_POWER) -#else - #define SIMDE_ARCH_POWER_ALTIVEC_CHECK(version) (0) -#endif - -#if defined(__riscv) && __riscv_xlen==64 -# define SIMDE_ARCH_RISCV64 -#endif - -/* SPARC - */ -#if defined(__sparc_v9__) || defined(__sparcv9) -# define SIMDE_ARCH_SPARC 9 -#elif defined(__sparc_v8__) || defined(__sparcv8) -# define SIMDE_ARCH_SPARC 8 -#elif defined(__sparc_v7__) || defined(__sparcv7) -# define SIMDE_ARCH_SPARC 7 -#elif defined(__sparc_v6__) || defined(__sparcv6) -# define SIMDE_ARCH_SPARC 6 -#elif defined(__sparc_v5__) || defined(__sparcv5) -# define SIMDE_ARCH_SPARC 5 -#elif defined(__sparc_v4__) || defined(__sparcv4) -# define SIMDE_ARCH_SPARC 4 -#elif defined(__sparc_v3__) || defined(__sparcv3) -# define SIMDE_ARCH_SPARC 3 -#elif defined(__sparc_v2__) || defined(__sparcv2) -# define SIMDE_ARCH_SPARC 2 -#elif defined(__sparc_v1__) || defined(__sparcv1) -# define SIMDE_ARCH_SPARC 1 -#elif defined(__sparc__) || defined(__sparc) -# define SIMDE_ARCH_SPARC 1 -#endif -#if defined(SIMDE_ARCH_SPARC) - #define SIMDE_ARCH_SPARC_CHECK(version) ((version) <= SIMDE_ARCH_SPARC) -#else - #define SIMDE_ARCH_SPARC_CHECK(version) (0) -#endif - -/* SuperH - */ -#if defined(__sh5__) || defined(__SH5__) -# define SIMDE_ARCH_SUPERH 5 -#elif defined(__sh4__) || defined(__SH4__) -# define SIMDE_ARCH_SUPERH 4 -#elif defined(__sh3__) || defined(__SH3__) -# define SIMDE_ARCH_SUPERH 3 -#elif defined(__sh2__) 
|| defined(__SH2__) -# define SIMDE_ARCH_SUPERH 2 -#elif defined(__sh1__) || defined(__SH1__) -# define SIMDE_ARCH_SUPERH 1 -#elif defined(__sh__) || defined(__SH__) -# define SIMDE_ARCH_SUPERH 1 -#endif - -/* IBM System z - */ -#if defined(__370__) || defined(__THW_370__) || defined(__s390__) || defined(__s390x__) || defined(__zarch__) || defined(__SYSC_ZARCH__) -# define SIMDE_ARCH_ZARCH __ARCH__ -#endif -#if defined(SIMDE_ARCH_ZARCH) - #define SIMDE_ARCH_ZARCH_CHECK(version) ((version) <= SIMDE_ARCH_ZARCH) -#else - #define SIMDE_ARCH_ZARCH_CHECK(version) (0) -#endif - -#if defined(SIMDE_ARCH_ZARCH) && defined(__VEC__) - #define SIMDE_ARCH_ZARCH_ZVECTOR SIMDE_ARCH_ZARCH -#endif - -/* TMS320 DSP - */ -#if defined(_TMS320C6740) || defined(__TMS320C6740__) -# define SIMDE_ARCH_TMS320 6740 -#elif defined(_TMS320C6700_PLUS) || defined(__TMS320C6700_PLUS__) -# define SIMDE_ARCH_TMS320 6701 -#elif defined(_TMS320C6700) || defined(__TMS320C6700__) -# define SIMDE_ARCH_TMS320 6700 -#elif defined(_TMS320C6600) || defined(__TMS320C6600__) -# define SIMDE_ARCH_TMS320 6600 -#elif defined(_TMS320C6400_PLUS) || defined(__TMS320C6400_PLUS__) -# define SIMDE_ARCH_TMS320 6401 -#elif defined(_TMS320C6400) || defined(__TMS320C6400__) -# define SIMDE_ARCH_TMS320 6400 -#elif defined(_TMS320C6200) || defined(__TMS320C6200__) -# define SIMDE_ARCH_TMS320 6200 -#elif defined(_TMS320C55X) || defined(__TMS320C55X__) -# define SIMDE_ARCH_TMS320 550 -#elif defined(_TMS320C54X) || defined(__TMS320C54X__) -# define SIMDE_ARCH_TMS320 540 -#elif defined(_TMS320C28X) || defined(__TMS320C28X__) -# define SIMDE_ARCH_TMS320 280 -#endif -#if defined(SIMDE_ARCH_TMS320) - #define SIMDE_ARCH_TMS320_CHECK(version) ((version) <= SIMDE_ARCH_TMS320) -#else - #define SIMDE_ARCH_TMS320_CHECK(version) (0) -#endif - -/* WebAssembly */ -#if defined(__wasm__) -# define SIMDE_ARCH_WASM 1 -#endif - -#if defined(SIMDE_ARCH_WASM) && defined(__wasm_simd128__) -# define SIMDE_ARCH_WASM_SIMD128 -#endif - -#if defined(SIMDE_ARCH_WASM) && defined(__wasm_relaxed_simd__) -# define SIMDE_ARCH_WASM_RELAXED_SIMD -#endif - -/* Xtensa - */ -#if defined(__xtensa__) || defined(__XTENSA__) -# define SIMDE_ARCH_XTENSA 1 -#endif - -/* Availability of 16-bit floating-point arithmetic intrinsics */ -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) -# define SIMDE_ARCH_ARM_NEON_FP16 -#endif - -/* Availability of 16-bit brain floating-point arithmetic intrinsics */ -#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) -# define SIMDE_ARCH_ARM_NEON_BF16 -#endif - -/* LoongArch - */ -#if defined(__loongarch32) -# define SIMDE_ARCH_LOONGARCH 1 -#elif defined(__loongarch64) -# define SIMDE_ARCH_LOONGARCH 2 -#endif - -/* LSX: LoongArch 128-bits SIMD extension */ -#if defined(__loongarch_sx) -# define SIMDE_ARCH_LOONGARCH_LSX 1 -#endif - -/* LASX: LoongArch 256-bits SIMD extension */ -#if defined(__loongarch_asx) -# define SIMDE_ARCH_LOONGARCH_LASX 2 -#endif - -#endif /* !defined(SIMDE_ARCH_H) */ diff --git a/extern/simde/simde-common.h b/extern/simde/simde-common.h deleted file mode 100644 index d316255d9..000000000 --- a/extern/simde/simde-common.h +++ /dev/null @@ -1,1194 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and 
to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2017-2020 Evan Nemerson - * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) - */ - -#if !defined(SIMDE_COMMON_H) -#define SIMDE_COMMON_H - -#include "hedley.h" - -#define SIMDE_VERSION_MAJOR 0 -#define SIMDE_VERSION_MINOR 8 -#define SIMDE_VERSION_MICRO 0 -#define SIMDE_VERSION HEDLEY_VERSION_ENCODE(SIMDE_VERSION_MAJOR, SIMDE_VERSION_MINOR, SIMDE_VERSION_MICRO) -// Also update meson.build in the root directory of the repository - -#include -#include - -#include "simde-detect-clang.h" -#include "simde-arch.h" -#include "simde-features.h" -#include "simde-diagnostic.h" -#include "simde-math.h" -#include "simde-constify.h" -#include "simde-align.h" - -/* In some situations, SIMDe has to make large performance sacrifices - * for small increases in how faithfully it reproduces an API, but - * only a relatively small number of users will actually need the API - * to be completely accurate. The SIMDE_FAST_* options can be used to - * disable these trade-offs. - * - * They can be enabled by passing -DSIMDE_FAST_MATH to the compiler, or - * the individual defines (e.g., -DSIMDE_FAST_NANS) if you only want to - * enable some optimizations. Using -ffast-math and/or - * -ffinite-math-only will also enable the relevant options. If you - * don't want that you can pass -DSIMDE_NO_FAST_* to disable them. */ - -/* Most programs avoid NaNs by never passing values which can result in - * a NaN; for example, if you only pass non-negative values to the sqrt - * functions, it won't generate a NaN. On some platforms, similar - * functions handle NaNs differently; for example, the _mm_min_ps SSE - * function will return 0.0 if you pass it (0.0, NaN), but the NEON - * vminq_f32 function will return NaN. Making them behave like one - * another is expensive; it requires generating a mask of all lanes - * with NaNs, then performing the operation (e.g., vminq_f32), then - * blending together the result with another vector using the mask. - * - * If you don't want SIMDe to worry about the differences between how - * NaNs are handled on the two platforms, define this (or pass - * -ffinite-math-only) */ -#if !defined(SIMDE_FAST_MATH) && !defined(SIMDE_NO_FAST_MATH) && defined(__FAST_MATH__) - #define SIMDE_FAST_MATH -#endif - -#if !defined(SIMDE_FAST_NANS) && !defined(SIMDE_NO_FAST_NANS) - #if defined(SIMDE_FAST_MATH) - #define SIMDE_FAST_NANS - #elif defined(__FINITE_MATH_ONLY__) - #if __FINITE_MATH_ONLY__ - #define SIMDE_FAST_NANS - #endif - #endif -#endif - -/* Many functions are defined as using the current rounding mode - * (i.e., the SIMD version of fegetround()) when converting to - * an integer. For example, _mm_cvtpd_epi32. 
Unfortunately, - * on some platforms (such as ARMv8+ where round-to-nearest is - * always used, regardless of the FPSCR register) this means we - * have to first query the current rounding mode, then choose - * the proper function (rounnd - , ceil, floor, etc.) */ -#if !defined(SIMDE_FAST_ROUND_MODE) && !defined(SIMDE_NO_FAST_ROUND_MODE) && defined(SIMDE_FAST_MATH) - #define SIMDE_FAST_ROUND_MODE -#endif - -/* This controls how ties are rounded. For example, does 10.5 round to - * 10 or 11? IEEE 754 specifies round-towards-even, but ARMv7 (for - * example) doesn't support it and it must be emulated (which is rather - * slow). If you're okay with just using the default for whatever arch - * you're on, you should definitely define this. - * - * Note that we don't use this macro to avoid correct implementations - * in functions which are explicitly about rounding (such as vrnd* on - * NEON, _mm_round_* on x86, etc.); it is only used for code where - * rounding is a component in another function, and even then it isn't - * usually a problem since such functions will use the current rounding - * mode. */ -#if !defined(SIMDE_FAST_ROUND_TIES) && !defined(SIMDE_NO_FAST_ROUND_TIES) && defined(SIMDE_FAST_MATH) - #define SIMDE_FAST_ROUND_TIES -#endif - -/* For functions which convert from one type to another (mostly from - * floating point to integer types), sometimes we need to do a range - * check and potentially return a different result if the value - * falls outside that range. Skipping this check can provide a - * performance boost, at the expense of faithfulness to the API we're - * emulating. */ -#if !defined(SIMDE_FAST_CONVERSION_RANGE) && !defined(SIMDE_NO_FAST_CONVERSION_RANGE) && defined(SIMDE_FAST_MATH) - #define SIMDE_FAST_CONVERSION_RANGE -#endif - -/* Due to differences across platforms, sometimes it can be much - * faster for us to allow spurious floating point exceptions, - * or to no generate them when we should. 
*/ -#if !defined(SIMDE_FAST_EXCEPTIONS) && !defined(SIMDE_NO_FAST_EXCEPTIONS) && defined(SIMDE_FAST_MATH) - #define SIMDE_FAST_EXCEPTIONS -#endif - -#if \ - HEDLEY_HAS_BUILTIN(__builtin_constant_p) || \ - HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_TINYC_VERSION_CHECK(0,9,19) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_IBM_VERSION_CHECK(13,1,0) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \ - (HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) && !defined(__cplusplus)) || \ - HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) - #define SIMDE_CHECK_CONSTANT_(expr) (__builtin_constant_p(expr)) -#elif defined(__cplusplus) && (__cplusplus > 201703L) - #include - #define SIMDE_CHECK_CONSTANT_(expr) (std::is_constant_evaluated()) -#endif - -#if !defined(SIMDE_NO_CHECK_IMMEDIATE_CONSTANT) - #if defined(SIMDE_CHECK_CONSTANT_) && \ - SIMDE_DETECT_CLANG_VERSION_CHECK(9,0,0) && \ - (!defined(__apple_build_version__) || ((__apple_build_version__ < 11000000) || (__apple_build_version__ >= 12000000))) - #define SIMDE_REQUIRE_CONSTANT(arg) HEDLEY_REQUIRE_MSG(SIMDE_CHECK_CONSTANT_(arg), "`" #arg "' must be constant") - #else - #define SIMDE_REQUIRE_CONSTANT(arg) - #endif -#else - #define SIMDE_REQUIRE_CONSTANT(arg) -#endif - -#define SIMDE_REQUIRE_RANGE(arg, min, max) \ - HEDLEY_REQUIRE_MSG((((arg) >= (min)) && ((arg) <= (max))), "'" #arg "' must be in [" #min ", " #max "]") - -#define SIMDE_REQUIRE_CONSTANT_RANGE(arg, min, max) \ - SIMDE_REQUIRE_CONSTANT(arg) \ - SIMDE_REQUIRE_RANGE(arg, min, max) - -/* A copy of HEDLEY_STATIC_ASSERT, except we don't define an empty - * fallback if we can't find an implementation; instead we have to - * check if SIMDE_STATIC_ASSERT is defined before using it. */ -#if \ - !defined(__cplusplus) && ( \ - (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || \ - HEDLEY_HAS_FEATURE(c_static_assert) || \ - HEDLEY_GCC_VERSION_CHECK(6,0,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - defined(_Static_assert) \ - ) - /* Sometimes _Static_assert is defined (in cdefs.h) using a symbol which - * starts with a double-underscore. This is a system header so we have no - * control over it, but since it's a macro it will emit a diagnostic which - * prevents compilation with -Werror. */ - #if HEDLEY_HAS_WARNING("-Wreserved-identifier") - #define SIMDE_STATIC_ASSERT(expr, message) (__extension__({ \ - HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("clang diagnostic ignored \"-Wreserved-identifier\"") \ - _Static_assert(expr, message); \ - HEDLEY_DIAGNOSTIC_POP \ - })) - #else - #define SIMDE_STATIC_ASSERT(expr, message) _Static_assert(expr, message) - #endif -#elif \ - (defined(__cplusplus) && (__cplusplus >= 201103L)) || \ - HEDLEY_MSVC_VERSION_CHECK(16,0,0) - #define SIMDE_STATIC_ASSERT(expr, message) HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(static_assert(expr, message)) -#endif - -/* Statement exprs */ -#if \ - HEDLEY_GNUC_VERSION_CHECK(2,95,0) || \ - HEDLEY_TINYC_VERSION_CHECK(0,9,26) || \ - HEDLEY_INTEL_VERSION_CHECK(9,0,0) || \ - HEDLEY_PGI_VERSION_CHECK(18,10,0) || \ - HEDLEY_SUNPRO_VERSION_CHECK(5,12,0) || \ - HEDLEY_IBM_VERSION_CHECK(11,1,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) - #define SIMDE_STATEMENT_EXPR_(expr) (__extension__ expr) -#endif - -/* This is just a convenience macro to make it easy to call a single - * function with a specific diagnostic disabled. 
*/ -#if defined(SIMDE_STATEMENT_EXPR_) - #define SIMDE_DISABLE_DIAGNOSTIC_EXPR_(diagnostic, expr) \ - SIMDE_STATEMENT_EXPR_(({ \ - HEDLEY_DIAGNOSTIC_PUSH \ - diagnostic \ - (expr); \ - HEDLEY_DIAGNOSTIC_POP \ - })) -#endif - -#if defined(SIMDE_CHECK_CONSTANT_) && defined(SIMDE_STATIC_ASSERT) - #define SIMDE_ASSERT_CONSTANT_(v) SIMDE_STATIC_ASSERT(SIMDE_CHECK_CONSTANT_(v), #v " must be constant.") -#endif - -#if \ - (HEDLEY_HAS_ATTRIBUTE(may_alias) && !defined(HEDLEY_SUNPRO_VERSION)) || \ - HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_IBM_VERSION_CHECK(13,1,0) -# define SIMDE_MAY_ALIAS __attribute__((__may_alias__)) -#else -# define SIMDE_MAY_ALIAS -#endif - -/* Lots of compilers support GCC-style vector extensions, but many - don't support all the features. Define different macros depending - on support for - - * SIMDE_VECTOR - Declaring a vector. - * SIMDE_VECTOR_OPS - basic operations (binary and unary). - * SIMDE_VECTOR_NEGATE - negating a vector - * SIMDE_VECTOR_SCALAR - For binary operators, the second argument - can be a scalar, in which case the result is as if that scalar - had been broadcast to all lanes of a vector. - * SIMDE_VECTOR_SUBSCRIPT - Supports array subscript notation for - extracting/inserting a single element.= - - SIMDE_VECTOR can be assumed if any others are defined, the - others are independent. */ -#if !defined(SIMDE_NO_VECTOR) -# if \ - HEDLEY_GCC_VERSION_CHECK(4,8,0) -# define SIMDE_VECTOR(size) __attribute__((__vector_size__(size))) -# define SIMDE_VECTOR_OPS -# define SIMDE_VECTOR_NEGATE -# define SIMDE_VECTOR_SCALAR -# define SIMDE_VECTOR_SUBSCRIPT -# elif HEDLEY_INTEL_VERSION_CHECK(16,0,0) -# define SIMDE_VECTOR(size) __attribute__((__vector_size__(size))) -# define SIMDE_VECTOR_OPS -# define SIMDE_VECTOR_NEGATE -/* ICC only supports SIMDE_VECTOR_SCALAR for constants */ -# define SIMDE_VECTOR_SUBSCRIPT -# elif \ - HEDLEY_GCC_VERSION_CHECK(4,1,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define SIMDE_VECTOR(size) __attribute__((__vector_size__(size))) -# define SIMDE_VECTOR_OPS -# elif HEDLEY_SUNPRO_VERSION_CHECK(5,12,0) -# define SIMDE_VECTOR(size) __attribute__((__vector_size__(size))) -# elif HEDLEY_HAS_ATTRIBUTE(vector_size) -# define SIMDE_VECTOR(size) __attribute__((__vector_size__(size))) -# define SIMDE_VECTOR_OPS -# define SIMDE_VECTOR_NEGATE -# define SIMDE_VECTOR_SUBSCRIPT -# if SIMDE_DETECT_CLANG_VERSION_CHECK(5,0,0) -# define SIMDE_VECTOR_SCALAR -# endif -# endif - -/* GCC and clang have built-in functions to handle shuffling and - converting of vectors, but the implementations are slightly - different. This macro is just an abstraction over them. Note that - elem_size is in bits but vec_size is in bytes. */ -# if !defined(SIMDE_NO_SHUFFLE_VECTOR) && defined(SIMDE_VECTOR_SUBSCRIPT) - HEDLEY_DIAGNOSTIC_PUSH - /* We don't care about -Wvariadic-macros; all compilers that support - * shufflevector/shuffle support them. */ -# if HEDLEY_HAS_WARNING("-Wc++98-compat-pedantic") -# pragma clang diagnostic ignored "-Wc++98-compat-pedantic" -# endif -# if HEDLEY_HAS_WARNING("-Wvariadic-macros") || HEDLEY_GCC_VERSION_CHECK(4,0,0) -# pragma GCC diagnostic ignored "-Wvariadic-macros" -# endif - -# if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) -# define SIMDE_SHUFFLE_VECTOR_(elem_size, vec_size, a, b, ...) 
__builtin_shufflevector(a, b, __VA_ARGS__) -# elif HEDLEY_GCC_HAS_BUILTIN(__builtin_shuffle,4,7,0) && !defined(__INTEL_COMPILER) -# define SIMDE_SHUFFLE_VECTOR_(elem_size, vec_size, a, b, ...) (__extension__ ({ \ - int##elem_size##_t SIMDE_VECTOR(vec_size) simde_shuffle_ = { __VA_ARGS__ }; \ - __builtin_shuffle(a, b, simde_shuffle_); \ - })) -# endif - HEDLEY_DIAGNOSTIC_POP -# endif - -/* TODO: this actually works on XL C/C++ without SIMDE_VECTOR_SUBSCRIPT - but the code needs to be refactored a bit to take advantage. */ -# if !defined(SIMDE_NO_CONVERT_VECTOR) && defined(SIMDE_VECTOR_SUBSCRIPT) -# if HEDLEY_HAS_BUILTIN(__builtin_convertvector) || HEDLEY_GCC_VERSION_CHECK(9,0,0) -# if HEDLEY_GCC_VERSION_CHECK(9,0,0) && !HEDLEY_GCC_VERSION_CHECK(9,3,0) - /* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93557 */ -# define SIMDE_CONVERT_VECTOR_(to, from) ((to) = (__extension__({ \ - __typeof__(from) from_ = (from); \ - ((void) from_); \ - __builtin_convertvector(from_, __typeof__(to)); \ - }))) -# else -# define SIMDE_CONVERT_VECTOR_(to, from) ((to) = __builtin_convertvector((from), __typeof__(to))) -# endif -# endif -# endif -#endif - -/* Since we currently require SUBSCRIPT before using a vector in a - union, we define these as dependencies of SUBSCRIPT. They are - likely to disappear in the future, once SIMDe learns how to make - use of vectors without using the union members. Do not use them - in your code unless you're okay with it breaking when SIMDe - changes. */ -#if defined(SIMDE_VECTOR_SUBSCRIPT) -# if defined(SIMDE_VECTOR_OPS) -# define SIMDE_VECTOR_SUBSCRIPT_OPS -# endif -# if defined(SIMDE_VECTOR_SCALAR) -# define SIMDE_VECTOR_SUBSCRIPT_SCALAR -# endif -#endif - -#if !defined(SIMDE_DISABLE_OPENMP) - #if !defined(SIMDE_ENABLE_OPENMP) && ((defined(_OPENMP) && (_OPENMP >= 201307L)) || (defined(_OPENMP_SIMD) && (_OPENMP_SIMD >= 201307L))) || defined(HEDLEY_MCST_LCC_VERSION) - #define SIMDE_ENABLE_OPENMP - #endif -#endif - -#if !defined(SIMDE_ENABLE_CILKPLUS) && (defined(__cilk) || defined(HEDLEY_INTEL_VERSION)) -# define SIMDE_ENABLE_CILKPLUS -#endif - -#if defined(SIMDE_ENABLE_OPENMP) -# define SIMDE_VECTORIZE HEDLEY_PRAGMA(omp simd) -# define SIMDE_VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(omp simd safelen(l)) -# if defined(__clang__) -# define SIMDE_VECTORIZE_REDUCTION(r) \ - HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("clang diagnostic ignored \"-Wsign-conversion\"") \ - HEDLEY_PRAGMA(omp simd reduction(r)) \ - HEDLEY_DIAGNOSTIC_POP -# else -# define SIMDE_VECTORIZE_REDUCTION(r) HEDLEY_PRAGMA(omp simd reduction(r)) -# endif -# if !defined(HEDLEY_MCST_LCC_VERSION) -# define SIMDE_VECTORIZE_ALIGNED(a) HEDLEY_PRAGMA(omp simd aligned(a)) -# else -# define SIMDE_VECTORIZE_ALIGNED(a) HEDLEY_PRAGMA(omp simd) -# endif -#elif defined(SIMDE_ENABLE_CILKPLUS) -# define SIMDE_VECTORIZE HEDLEY_PRAGMA(simd) -# define SIMDE_VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(simd vectorlength(l)) -# define SIMDE_VECTORIZE_REDUCTION(r) HEDLEY_PRAGMA(simd reduction(r)) -# define SIMDE_VECTORIZE_ALIGNED(a) HEDLEY_PRAGMA(simd aligned(a)) -#elif defined(__clang__) && !defined(HEDLEY_IBM_VERSION) -# define SIMDE_VECTORIZE HEDLEY_PRAGMA(clang loop vectorize(enable)) -# define SIMDE_VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(clang loop vectorize_width(l)) -# define SIMDE_VECTORIZE_REDUCTION(r) SIMDE_VECTORIZE -# define SIMDE_VECTORIZE_ALIGNED(a) -#elif HEDLEY_GCC_VERSION_CHECK(4,9,0) -# define SIMDE_VECTORIZE HEDLEY_PRAGMA(GCC ivdep) -# define SIMDE_VECTORIZE_SAFELEN(l) SIMDE_VECTORIZE -# define SIMDE_VECTORIZE_REDUCTION(r) SIMDE_VECTORIZE -# 
define SIMDE_VECTORIZE_ALIGNED(a) -#elif HEDLEY_CRAY_VERSION_CHECK(5,0,0) -# define SIMDE_VECTORIZE HEDLEY_PRAGMA(_CRI ivdep) -# define SIMDE_VECTORIZE_SAFELEN(l) SIMDE_VECTORIZE -# define SIMDE_VECTORIZE_REDUCTION(r) SIMDE_VECTORIZE -# define SIMDE_VECTORIZE_ALIGNED(a) -#else -# define SIMDE_VECTORIZE -# define SIMDE_VECTORIZE_SAFELEN(l) -# define SIMDE_VECTORIZE_REDUCTION(r) -# define SIMDE_VECTORIZE_ALIGNED(a) -#endif - -#define SIMDE_MASK_NZ_(v, mask) (((v) & (mask)) | !((v) & (mask))) - -/* Intended for checking coverage, you should never use this in - production. */ -#if defined(SIMDE_NO_INLINE) -# define SIMDE_FUNCTION_ATTRIBUTES HEDLEY_NEVER_INLINE static -#else -# define SIMDE_FUNCTION_ATTRIBUTES HEDLEY_ALWAYS_INLINE static -#endif - -#if defined(SIMDE_NO_INLINE) -# define SIMDE_HUGE_FUNCTION_ATTRIBUTES HEDLEY_NEVER_INLINE static -#elif defined(SIMDE_CONSTRAINED_COMPILATION) -# define SIMDE_HUGE_FUNCTION_ATTRIBUTES static -#else -# define SIMDE_HUGE_FUNCTION_ATTRIBUTES HEDLEY_ALWAYS_INLINE static -#endif - -#if \ - HEDLEY_HAS_ATTRIBUTE(unused) || \ - HEDLEY_GCC_VERSION_CHECK(2,95,0) -# define SIMDE_FUNCTION_POSSIBLY_UNUSED_ __attribute__((__unused__)) -#else -# define SIMDE_FUNCTION_POSSIBLY_UNUSED_ -#endif - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED_ - -#if defined(_MSC_VER) -# define SIMDE_BEGIN_DECLS_ HEDLEY_DIAGNOSTIC_PUSH __pragma(warning(disable:4996 4204)) HEDLEY_BEGIN_C_DECLS -# define SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP HEDLEY_END_C_DECLS -#else -# define SIMDE_BEGIN_DECLS_ \ - HEDLEY_DIAGNOSTIC_PUSH \ - SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED_ \ - HEDLEY_BEGIN_C_DECLS -# define SIMDE_END_DECLS_ \ - HEDLEY_END_C_DECLS \ - HEDLEY_DIAGNOSTIC_POP -#endif - -#if defined(__SIZEOF_INT128__) -# define SIMDE_HAVE_INT128_ -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DIAGNOSTIC_DISABLE_PEDANTIC_ -typedef __int128 simde_int128; -typedef unsigned __int128 simde_uint128; -HEDLEY_DIAGNOSTIC_POP -#endif - -#if !defined(SIMDE_ENDIAN_LITTLE) -# define SIMDE_ENDIAN_LITTLE 1234 -#endif -#if !defined(SIMDE_ENDIAN_BIG) -# define SIMDE_ENDIAN_BIG 4321 -#endif - -#if !defined(SIMDE_ENDIAN_ORDER) -/* GCC (and compilers masquerading as GCC) define __BYTE_ORDER__. */ -# if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) -# define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE -# elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) -# define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG -/* TI defines _BIG_ENDIAN or _LITTLE_ENDIAN */ -# elif defined(_BIG_ENDIAN) -# define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG -# elif defined(_LITTLE_ENDIAN) -# define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE -/* We know the endianness of some common architectures. Common - * architectures not listed (ARM, POWER, MIPS, etc.) here are - * bi-endian. */ -# elif defined(__amd64) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) -# define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE -# elif defined(__s390x__) || defined(__zarch__) -# define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG -/* Looks like we'll have to rely on the platform. If we're missing a - * platform, please let us know. 
*/ -# elif defined(_WIN32) -# define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE -# elif defined(sun) || defined(__sun) /* Solaris */ -# include -# if defined(_LITTLE_ENDIAN) -# define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE -# elif defined(_BIG_ENDIAN) -# define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG -# endif -# elif defined(__APPLE__) -# include -# if defined(__LITTLE_ENDIAN__) -# define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE -# elif defined(__BIG_ENDIAN__) -# define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG -# endif -# elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__bsdi__) || defined(__DragonFly__) || defined(BSD) -# include -# if defined(__BYTE_ORDER) && (__BYTE_ORDER == __LITTLE_ENDIAN) -# define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE -# elif defined(__BYTE_ORDER) && (__BYTE_ORDER == __BIG_ENDIAN) -# define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG -# endif -# elif defined(__linux__) || defined(__linux) || defined(__gnu_linux__) -# include -# if defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && (__BYTE_ORDER == __LITTLE_ENDIAN) -# define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE -# elif defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && (__BYTE_ORDER == __BIG_ENDIAN) -# define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG -# endif -# endif -#endif - -#if \ - HEDLEY_HAS_BUILTIN(__builtin_bswap64) || \ - HEDLEY_GCC_VERSION_CHECK(4,3,0) || \ - HEDLEY_IBM_VERSION_CHECK(13,1,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) - #define simde_bswap64(v) __builtin_bswap64(v) -#elif HEDLEY_MSVC_VERSION_CHECK(13,10,0) - #define simde_bswap64(v) _byteswap_uint64(v) -#else - SIMDE_FUNCTION_ATTRIBUTES - uint64_t - simde_bswap64(uint64_t v) { - return - ((v & (((uint64_t) 0xff) << 56)) >> 56) | - ((v & (((uint64_t) 0xff) << 48)) >> 40) | - ((v & (((uint64_t) 0xff) << 40)) >> 24) | - ((v & (((uint64_t) 0xff) << 32)) >> 8) | - ((v & (((uint64_t) 0xff) << 24)) << 8) | - ((v & (((uint64_t) 0xff) << 16)) << 24) | - ((v & (((uint64_t) 0xff) << 8)) << 40) | - ((v & (((uint64_t) 0xff) )) << 56); - } -#endif - -#if !defined(SIMDE_ENDIAN_ORDER) -# error Unknown byte order; please file a bug -#else -# if SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE -# define simde_endian_bswap64_be(value) simde_bswap64(value) -# define simde_endian_bswap64_le(value) (value) -# elif SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG -# define simde_endian_bswap64_be(value) (value) -# define simde_endian_bswap64_le(value) simde_bswap64(value) -# endif -#endif - -/* TODO: we should at least make an attempt to detect the correct - types for simde_float32/float64 instead of just assuming float and - double. 
*/ - -#if !defined(SIMDE_FLOAT32_TYPE) -# define SIMDE_FLOAT32_TYPE float -# define SIMDE_FLOAT32_C(value) value##f -#else -# define SIMDE_FLOAT32_C(value) ((SIMDE_FLOAT32_TYPE) value) -#endif -typedef SIMDE_FLOAT32_TYPE simde_float32; - -#if !defined(SIMDE_FLOAT64_TYPE) -# define SIMDE_FLOAT64_TYPE double -# define SIMDE_FLOAT64_C(value) value -#else -# define SIMDE_FLOAT64_C(value) ((SIMDE_FLOAT64_TYPE) value) -#endif -typedef SIMDE_FLOAT64_TYPE simde_float64; - -#if defined(SIMDE_POLY8_TYPE) -# undef SIMDE_POLY8_TYPE -#endif -#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) -# define SIMDE_POLY8_TYPE poly8_t -# define SIMDE_POLY8_C(value) (HEDLEY_STATIC_CAST(poly8_t, value)) -#else -# define SIMDE_POLY8_TYPE uint8_t -# define SIMDE_POLY8_C(value) (HEDLEY_STATIC_CAST(uint8_t, value)) -#endif -typedef SIMDE_POLY8_TYPE simde_poly8; - -#if defined(SIMDE_POLY16_TYPE) -# undef SIMDE_POLY16_TYPE -#endif -#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) -# define SIMDE_POLY16_TYPE poly16_t -# define SIMDE_POLY16_C(value) (HEDLEY_STATIC_CAST(poly16_t, value)) -#else -# define SIMDE_POLY16_TYPE uint16_t -# define SIMDE_POLY16_C(value) (HEDLEY_STATIC_CAST(uint16_t, value)) -#endif -typedef SIMDE_POLY16_TYPE simde_poly16; - -#if defined(SIMDE_POLY64_TYPE) -# undef SIMDE_POLY64_TYPE -#endif -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) -# define SIMDE_POLY64_TYPE poly64_t -# define SIMDE_POLY64_C(value) (HEDLEY_STATIC_CAST(poly64_t, value ## ull)) -#else -# define SIMDE_POLY64_TYPE uint64_t -# define SIMDE_POLY64_C(value) value ## ull -#endif -typedef SIMDE_POLY64_TYPE simde_poly64; - -#if 0 // todo box2d fix gcc build -#if defined(SIMDE_POLY128_TYPE) -# undef SIMDE_POLY128_TYPE -#endif -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) -# define SIMDE_POLY128_TYPE poly128_t -# define SIMDE_POLY128_C(value) value -#elif defined(__SIZEOF_INT128__) -# define SIMDE_POLY128_TYPE __int128 -# define SIMDE_POLY128_C(value) (HEDLEY_STATIC_CAST(__int128, value)) -#else -# define SIMDE_POLY128_TYPE uint64_t -# define SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE 1 -#endif -typedef SIMDE_POLY128_TYPE simde_poly128; -#endif - -#if defined(__cplusplus) - typedef bool simde_bool; -#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) - typedef _Bool simde_bool; -#elif defined(bool) - typedef bool simde_bool; -#else - #include - typedef bool simde_bool; -#endif - -#if HEDLEY_HAS_WARNING("-Wbad-function-cast") -# define SIMDE_CONVERT_FTOI(T,v) \ - HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("clang diagnostic ignored \"-Wbad-function-cast\"") \ - HEDLEY_STATIC_CAST(T, (v)) \ - HEDLEY_DIAGNOSTIC_POP -#else -# define SIMDE_CONVERT_FTOI(T,v) ((T) (v)) -#endif - -/* TODO: detect compilers which support this outside of C11 mode */ -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) - #define SIMDE_CHECKED_REINTERPRET_CAST(to, from, value) _Generic((value), to: (value), default: (_Generic((value), from: ((to) (value))))) - #define SIMDE_CHECKED_STATIC_CAST(to, from, value) _Generic((value), to: (value), default: (_Generic((value), from: ((to) (value))))) -#else - #define SIMDE_CHECKED_REINTERPRET_CAST(to, from, value) HEDLEY_REINTERPRET_CAST(to, value) - #define SIMDE_CHECKED_STATIC_CAST(to, from, value) HEDLEY_STATIC_CAST(to, value) -#endif - -#if HEDLEY_HAS_WARNING("-Wfloat-equal") -# define SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL _Pragma("clang diagnostic ignored \"-Wfloat-equal\"") -#elif HEDLEY_GCC_VERSION_CHECK(3,0,0) -# define SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL _Pragma("GCC diagnostic ignored 
\"-Wfloat-equal\"") -#else -# define SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL -#endif - -/* Some functions can trade accuracy for speed. For those functions - you can control the trade-off using this macro. Possible values: - - 0: prefer speed - 1: reasonable trade-offs - 2: prefer accuracy */ -#if !defined(SIMDE_ACCURACY_PREFERENCE) -# define SIMDE_ACCURACY_PREFERENCE 1 -#endif - -#if defined(__STDC_HOSTED__) -# define SIMDE_STDC_HOSTED __STDC_HOSTED__ -#else -# if \ - defined(HEDLEY_PGI_VERSION) || \ - defined(HEDLEY_MSVC_VERSION) -# define SIMDE_STDC_HOSTED 1 -# else -# define SIMDE_STDC_HOSTED 0 -# endif -#endif - -/* Try to deal with environments without a standard library. */ -#if !defined(simde_memcpy) - #if HEDLEY_HAS_BUILTIN(__builtin_memcpy) - #define simde_memcpy(dest, src, n) __builtin_memcpy(dest, src, n) - #endif -#endif -#if !defined(simde_memset) - #if HEDLEY_HAS_BUILTIN(__builtin_memset) - #define simde_memset(s, c, n) __builtin_memset(s, c, n) - #endif -#endif -#if !defined(simde_memcmp) - #if HEDLEY_HAS_BUILTIN(__builtin_memcmp) - #define simde_memcmp(s1, s2, n) __builtin_memcmp(s1, s2, n) - #endif -#endif - -#if !defined(simde_memcpy) || !defined(simde_memset) || !defined(simde_memcmp) - #if !defined(SIMDE_NO_STRING_H) - #if defined(__has_include) - #if !__has_include() - #define SIMDE_NO_STRING_H - #endif - #elif (SIMDE_STDC_HOSTED == 0) - #define SIMDE_NO_STRING_H - #endif - #endif - - #if !defined(SIMDE_NO_STRING_H) - #include - #if !defined(simde_memcpy) - #define simde_memcpy(dest, src, n) memcpy(dest, src, n) - #endif - #if !defined(simde_memset) - #define simde_memset(s, c, n) memset(s, c, n) - #endif - #if !defined(simde_memcmp) - #define simde_memcmp(s1, s2, n) memcmp(s1, s2, n) - #endif - #else - /* These are meant to be portable, not fast. If you're hitting them you - * should think about providing your own (by defining the simde_memcpy - * macro prior to including any SIMDe files) or submitting a patch to - * SIMDe so we can detect your system-provided memcpy/memset, like by - * adding your compiler to the checks for __builtin_memcpy and/or - * __builtin_memset. 
*/
-    #if !defined(simde_memcpy)
-      SIMDE_FUNCTION_ATTRIBUTES
-      void
-      simde_memcpy_(void* dest, const void* src, size_t len) {
-        char* dest_ = HEDLEY_STATIC_CAST(char*, dest);
-        const char* src_ = HEDLEY_STATIC_CAST(const char*, src);
-        for (size_t i = 0 ; i < len ; i++) {
-          dest_[i] = src_[i];
-        }
-      }
-      #define simde_memcpy(dest, src, n) simde_memcpy_(dest, src, n)
-    #endif
-
-    #if !defined(simde_memset)
-      SIMDE_FUNCTION_ATTRIBUTES
-      void
-      simde_memset_(void* s, int c, size_t len) {
-        char* s_ = HEDLEY_STATIC_CAST(char*, s);
-        char c_ = HEDLEY_STATIC_CAST(char, c);
-        for (size_t i = 0 ; i < len ; i++) {
-          s_[i] = c_;
-        }
-      }
-      #define simde_memset(s, c, n) simde_memset_(s, c, n)
-    #endif
-
-    #if !defined(simde_memcmp)
-      SIMDE_FUNCTION_ATTRIBUTES
-      int
-      simde_memcmp_(const void *s1, const void *s2, size_t n) {
-        const unsigned char* s1_ = HEDLEY_STATIC_CAST(const unsigned char*, s1);
-        const unsigned char* s2_ = HEDLEY_STATIC_CAST(const unsigned char*, s2);
-        for (size_t i = 0 ; i < n ; i++) {
-          if (s1_[i] != s2_[i]) {
-            return (int) (s1_[i] - s2_[i]);
-          }
-        }
-        return 0;
-      }
-      #define simde_memcmp(s1, s2, n) simde_memcmp_(s1, s2, n)
-    #endif
-  #endif
-#endif
-
-/*** Functions that quiet a signaling NaN ***/
-
-static HEDLEY_INLINE
-double
-simde_math_quiet(double x) {
-  uint64_t tmp, mask;
-  if (!simde_math_isnan(x)) {
-    return x;
-  }
-  simde_memcpy(&tmp, &x, 8);
-  mask = 0x7ff80000;
-  mask <<= 32;
-  tmp |= mask;
-  simde_memcpy(&x, &tmp, 8);
-  return x;
-}
-
-static HEDLEY_INLINE
-float
-simde_math_quietf(float x) {
-  uint32_t tmp;
-  if (!simde_math_isnanf(x)) {
-    return x;
-  }
-  simde_memcpy(&tmp, &x, 4);
-  tmp |= 0x7fc00000lu;
-  simde_memcpy(&x, &tmp, 4);
-  return x;
-}
-
-#if defined(FE_ALL_EXCEPT)
-  #define SIMDE_HAVE_FENV_H
-#elif defined(__has_include)
-  #if __has_include(<fenv.h>)
-    #include <fenv.h>
-    #define SIMDE_HAVE_FENV_H
-  #endif
-#elif SIMDE_STDC_HOSTED == 1
-  #include <fenv.h>
-  #define SIMDE_HAVE_FENV_H
-#endif
-
-#if defined(EXIT_FAILURE)
-  #define SIMDE_HAVE_STDLIB_H
-#elif defined(__has_include)
-  #if __has_include(<stdlib.h>)
-    #include <stdlib.h>
-    #define SIMDE_HAVE_STDLIB_H
-  #endif
-#elif SIMDE_STDC_HOSTED == 1
-  #include <stdlib.h>
-  #define SIMDE_HAVE_STDLIB_H
-#endif
-
-#if defined(__has_include)
-# if defined(__cplusplus) && (__cplusplus >= 201103L) && __has_include()
-# include
-# elif __has_include()
-# include
-# endif
-# if __has_include()
-# include
-# endif
-#elif SIMDE_STDC_HOSTED == 1
-# include
-# include
-#endif
-
-#define SIMDE_DEFINE_CONVERSION_FUNCTION_(Name, T_To, T_From) \
-  static HEDLEY_ALWAYS_INLINE HEDLEY_CONST SIMDE_FUNCTION_POSSIBLY_UNUSED_ \
-  T_To \
-  Name (T_From value) { \
-    T_To r; \
-    simde_memcpy(&r, &value, sizeof(r)); \
-    return r; \
-  }
-
-SIMDE_DEFINE_CONVERSION_FUNCTION_(simde_float32_as_uint32, uint32_t, simde_float32)
-SIMDE_DEFINE_CONVERSION_FUNCTION_(simde_uint32_as_float32, simde_float32, uint32_t)
-SIMDE_DEFINE_CONVERSION_FUNCTION_(simde_float64_as_uint64, uint64_t, simde_float64)
-SIMDE_DEFINE_CONVERSION_FUNCTION_(simde_uint64_as_float64, simde_float64, uint64_t)
-
-#include "check.h"
-
-/* GCC/clang have a bunch of functionality in builtins which we would
- * like to access, but the suffixes indicate whether they operate on
- * int, long, or long long, not fixed width types (e.g., int32_t).
- * we use these macros to attempt to map from fixed-width to the
- * names GCC uses.  Note that you should still cast the input(s) and
- * return values (to/from SIMDE_BUILTIN_TYPE_*_) since often even if
- * types are the same size they may not be compatible according to the
- * compiler.
For example, on x86 long and long lonsg are generally - * both 64 bits, but platforms vary on whether an int64_t is mapped - * to a long or long long. */ - -#include - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ - -#if (INT8_MAX == INT_MAX) && (INT8_MIN == INT_MIN) - #define SIMDE_BUILTIN_SUFFIX_8_ - #define SIMDE_BUILTIN_TYPE_8_ int -#elif (INT8_MAX == LONG_MAX) && (INT8_MIN == LONG_MIN) - #define SIMDE_BUILTIN_SUFFIX_8_ l - #define SIMDE_BUILTIN_TYPE_8_ long -#elif (INT8_MAX == LLONG_MAX) && (INT8_MIN == LLONG_MIN) - #define SIMDE_BUILTIN_SUFFIX_8_ ll - #define SIMDE_BUILTIN_TYPE_8_ long long -#endif - -#if (INT16_MAX == INT_MAX) && (INT16_MIN == INT_MIN) - #define SIMDE_BUILTIN_SUFFIX_16_ - #define SIMDE_BUILTIN_TYPE_16_ int -#elif (INT16_MAX == LONG_MAX) && (INT16_MIN == LONG_MIN) - #define SIMDE_BUILTIN_SUFFIX_16_ l - #define SIMDE_BUILTIN_TYPE_16_ long -#elif (INT16_MAX == LLONG_MAX) && (INT16_MIN == LLONG_MIN) - #define SIMDE_BUILTIN_SUFFIX_16_ ll - #define SIMDE_BUILTIN_TYPE_16_ long long -#endif - -#if (INT32_MAX == INT_MAX) && (INT32_MIN == INT_MIN) - #define SIMDE_BUILTIN_SUFFIX_32_ - #define SIMDE_BUILTIN_TYPE_32_ int -#elif (INT32_MAX == LONG_MAX) && (INT32_MIN == LONG_MIN) - #define SIMDE_BUILTIN_SUFFIX_32_ l - #define SIMDE_BUILTIN_TYPE_32_ long -#elif (INT32_MAX == LLONG_MAX) && (INT32_MIN == LLONG_MIN) - #define SIMDE_BUILTIN_SUFFIX_32_ ll - #define SIMDE_BUILTIN_TYPE_32_ long long -#endif - -#if (INT64_MAX == INT_MAX) && (INT64_MIN == INT_MIN) - #define SIMDE_BUILTIN_SUFFIX_64_ - #define SIMDE_BUILTIN_TYPE_64_ int -#elif (INT64_MAX == LONG_MAX) && (INT64_MIN == LONG_MIN) - #define SIMDE_BUILTIN_SUFFIX_64_ l - #define SIMDE_BUILTIN_TYPE_64_ long -#elif (INT64_MAX == LLONG_MAX) && (INT64_MIN == LLONG_MIN) - #define SIMDE_BUILTIN_SUFFIX_64_ ll - #define SIMDE_BUILTIN_TYPE_64_ long long -#endif - -/* SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ */ -HEDLEY_DIAGNOSTIC_POP - -#if defined(SIMDE_BUILTIN_SUFFIX_8_) - #define SIMDE_BUILTIN_8_(name) HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_8_) - #define SIMDE_BUILTIN_HAS_8_(name) HEDLEY_HAS_BUILTIN(HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_8_)) -#else - #define SIMDE_BUILTIN_HAS_8_(name) 0 -#endif -#if defined(SIMDE_BUILTIN_SUFFIX_16_) - #define SIMDE_BUILTIN_16_(name) HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_16_) - #define SIMDE_BUILTIN_HAS_16_(name) HEDLEY_HAS_BUILTIN(HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_16_)) -#else - #define SIMDE_BUILTIN_HAS_16_(name) 0 -#endif -#if defined(SIMDE_BUILTIN_SUFFIX_32_) - #define SIMDE_BUILTIN_32_(name) HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_32_) - #define SIMDE_BUILTIN_HAS_32_(name) HEDLEY_HAS_BUILTIN(HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_32_)) -#else - #define SIMDE_BUILTIN_HAS_32_(name) 0 -#endif -#if defined(SIMDE_BUILTIN_SUFFIX_64_) - #define SIMDE_BUILTIN_64_(name) HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_64_) - #define SIMDE_BUILTIN_HAS_64_(name) HEDLEY_HAS_BUILTIN(HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_64_)) -#else - #define SIMDE_BUILTIN_HAS_64_(name) 0 -#endif - -#if !defined(__cplusplus) - #if defined(__clang__) - #if HEDLEY_HAS_WARNING("-Wc11-extensions") - #define SIMDE_GENERIC_(...) (__extension__ ({ \ - HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("clang diagnostic ignored \"-Wc11-extensions\"") \ - _Generic(__VA_ARGS__); \ - HEDLEY_DIAGNOSTIC_POP \ - })) - #elif HEDLEY_HAS_WARNING("-Wc1x-extensions") - #define SIMDE_GENERIC_(...) 
(__extension__ ({ \ - HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("clang diagnostic ignored \"-Wc1x-extensions\"") \ - _Generic(__VA_ARGS__); \ - HEDLEY_DIAGNOSTIC_POP \ - })) - #endif - #elif \ - defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) || \ - HEDLEY_HAS_EXTENSION(c_generic_selections) || \ - HEDLEY_GCC_VERSION_CHECK(4,9,0) || \ - HEDLEY_INTEL_VERSION_CHECK(17,0,0) || \ - HEDLEY_IBM_VERSION_CHECK(12,1,0) || \ - HEDLEY_ARM_VERSION_CHECK(5,3,0) - #define SIMDE_GENERIC_(...) _Generic(__VA_ARGS__) - #endif -#endif - -/* Sometimes we run into problems with specific versions of compilers - which make the native versions unusable for us. Often this is due - to missing functions, sometimes buggy implementations, etc. These - macros are how we check for specific bugs. As they are fixed we'll - start only defining them for problematic compiler versions. */ - -#if !defined(SIMDE_IGNORE_COMPILER_BUGS) -# if defined(HEDLEY_GCC_VERSION) -# if !HEDLEY_GCC_VERSION_CHECK(4,9,0) -# define SIMDE_BUG_GCC_REV_208793 -# endif -# if !HEDLEY_GCC_VERSION_CHECK(5,0,0) -# define SIMDE_BUG_GCC_BAD_MM_SRA_EPI32 /* TODO: find relevant bug or commit */ -# endif -# if !HEDLEY_GCC_VERSION_CHECK(6,0,0) -# define SIMDE_BUG_GCC_SIZEOF_IMMEDIATE -# endif -# if !HEDLEY_GCC_VERSION_CHECK(4,6,0) -# define SIMDE_BUG_GCC_BAD_MM_EXTRACT_EPI8 /* TODO: find relevant bug or commit */ -# endif -# if !HEDLEY_GCC_VERSION_CHECK(8,0,0) -# define SIMDE_BUG_GCC_REV_247851 -# endif -# if !HEDLEY_GCC_VERSION_CHECK(10,0,0) -# define SIMDE_BUG_GCC_REV_274313 -# define SIMDE_BUG_GCC_91341 -# define SIMDE_BUG_GCC_92035 -# endif -# if !HEDLEY_GCC_VERSION_CHECK(9,0,0) && defined(SIMDE_ARCH_AARCH64) -# define SIMDE_BUG_GCC_ARM_SHIFT_SCALAR -# endif -# if !HEDLEY_GCC_VERSION_CHECK(9,0,0) && defined(SIMDE_ARCH_AARCH64) -# define SIMDE_BUG_GCC_BAD_VEXT_REV32 -# endif -# if !(HEDLEY_GCC_VERSION_CHECK(9,4,0) \ - || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && !HEDLEY_GCC_VERSION_CHECK(9,0,0)) \ - ) && defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64) -# define SIMDE_BUG_GCC_94482 -# endif -# if (defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64)) || defined(SIMDE_ARCH_ZARCH) -# define SIMDE_BUG_GCC_53784 -# endif -# if defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64) -# if HEDLEY_GCC_VERSION_CHECK(4,3,0) /* -Wsign-conversion */ -# define SIMDE_BUG_GCC_95144 -# endif -# if !HEDLEY_GCC_VERSION_CHECK(11,2,0) -# define SIMDE_BUG_GCC_95483 -# endif -# if defined(__OPTIMIZE__) -# define SIMDE_BUG_GCC_100927 -# endif -# if !(HEDLEY_GCC_VERSION_CHECK(10,3,0)) -# define SIMDE_BUG_GCC_98521 -# endif -# endif -# if !HEDLEY_GCC_VERSION_CHECK(9,4,0) && defined(SIMDE_ARCH_AARCH64) -# define SIMDE_BUG_GCC_94488 -# endif -# if !HEDLEY_GCC_VERSION_CHECK(9,1,0) && defined(SIMDE_ARCH_AARCH64) -# define SIMDE_BUG_GCC_REV_264019 -# endif -# if (!HEDLEY_GCC_VERSION_CHECK(9,0,0) && !defined(SIMDE_ARCH_AARCH64)) || (!defined(SIMDE_ARCH_AARCH64) && defined(SIMDE_ARCH_ARM)) -# define SIMDE_BUG_GCC_REV_260989 -# endif -# if defined(SIMDE_ARCH_ARM) && !defined(SIMDE_ARCH_AARCH64) -# define SIMDE_BUG_GCC_95399 -# define SIMDE_BUG_GCC_95471 -# define SIMDE_BUG_GCC_111609 -# elif defined(SIMDE_ARCH_POWER) -# define SIMDE_BUG_GCC_95227 -# define SIMDE_BUG_GCC_95782 -# if !HEDLEY_GCC_VERSION_CHECK(12,0,0) -# define SIMDE_BUG_VEC_CPSGN_REVERSED_ARGS -# endif -# elif defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64) -# if !HEDLEY_GCC_VERSION_CHECK(10,2,0) && !defined(__OPTIMIZE__) -# define SIMDE_BUG_GCC_96174 -# endif -# elif defined(SIMDE_ARCH_ZARCH) -# define 
SIMDE_BUG_GCC_95782 -# if HEDLEY_GCC_VERSION_CHECK(10,0,0) -# define SIMDE_BUG_GCC_101614 -# endif -# endif -# if defined(SIMDE_ARCH_MIPS_MSA) -# define SIMDE_BUG_GCC_97248 -# if !HEDLEY_GCC_VERSION_CHECK(12,1,0) -# define SIMDE_BUG_GCC_100760 -# define SIMDE_BUG_GCC_100761 -# define SIMDE_BUG_GCC_100762 -# endif -# endif -# if !defined(__OPTIMIZE__) && !(\ - HEDLEY_GCC_VERSION_CHECK(11,4,0) \ - || (HEDLEY_GCC_VERSION_CHECK(10,4,0) && !(HEDLEY_GCC_VERSION_CHECK(11,0,0))) \ - || (HEDLEY_GCC_VERSION_CHECK(9,5,0) && !(HEDLEY_GCC_VERSION_CHECK(10,0,0)))) -# define SIMDE_BUG_GCC_105339 -# endif -# elif defined(__clang__) -# if defined(SIMDE_ARCH_AARCH64) -# define SIMDE_BUG_CLANG_48257 // https://github.com/llvm/llvm-project/issues/47601 -# define SIMDE_BUG_CLANG_71362 // https://github.com/llvm/llvm-project/issues/71362 -# define SIMDE_BUG_CLANG_71365 // https://github.com/llvm/llvm-project/issues/71365 -# define SIMDE_BUG_CLANG_71751 // https://github.com/llvm/llvm-project/issues/71751 -# if !SIMDE_DETECT_CLANG_VERSION_CHECK(15,0,0) -# define SIMDE_BUG_CLANG_45541 -# endif -# if !SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0) -# define SIMDE_BUG_CLANG_46840 -# define SIMDE_BUG_CLANG_46844 -# endif -# if SIMDE_DETECT_CLANG_VERSION_CHECK(10,0,0) && SIMDE_DETECT_CLANG_VERSION_NOT(11,0,0) -# define SIMDE_BUG_CLANG_BAD_VI64_OPS -# endif -# if SIMDE_DETECT_CLANG_VERSION_NOT(9,0,0) -# define SIMDE_BUG_CLANG_GIT_4EC445B8 -# define SIMDE_BUG_CLANG_REV_365298 /* 0464e07c8f6e3310c28eb210a4513bc2243c2a7e */ -# endif -# endif -# if defined(SIMDE_ARCH_ARM) -# if !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) -# define SIMDE_BUG_CLANG_BAD_VGET_SET_LANE_TYPES -# endif -# if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_ARM_NEON_A32V8_NATIVE) -# define SIMDE_BUG_CLANG_71763 // https://github.com/llvm/llvm-project/issues/71763 -# endif -# endif -# if defined(SIMDE_ARCH_POWER) && !SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0) -# define SIMDE_BUG_CLANG_46770 -# endif -# if defined(SIMDE_ARCH_POWER) && (SIMDE_ARCH_POWER == 700) && (SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0)) -# if !SIMDE_DETECT_CLANG_VERSION_CHECK(13,0,0) -# define SIMDE_BUG_CLANG_50893 -# define SIMDE_BUG_CLANG_50901 -# endif -# endif -# if defined(_ARCH_PWR9) && !SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0) && !defined(__OPTIMIZE__) -# define SIMDE_BUG_CLANG_POWER9_16x4_BAD_SHIFT -# endif -# if defined(SIMDE_ARCH_POWER) -# if !SIMDE_DETECT_CLANG_VERSION_CHECK(14,0,0) -# define SIMDE_BUG_CLANG_50932 -# endif -# if !SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0) -# define SIMDE_BUG_VEC_CPSGN_REVERSED_ARGS -# endif -# endif -# if defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64) -# if SIMDE_DETECT_CLANG_VERSION_NOT(5,0,0) -# define SIMDE_BUG_CLANG_REV_298042 /* 6afc436a7817a52e78ae7bcdc3faafd460124cac */ -# endif -# if SIMDE_DETECT_CLANG_VERSION_NOT(3,7,0) -# define SIMDE_BUG_CLANG_REV_234560 /* b929ad7b1726a32650a8051f69a747fb6836c540 */ -# endif -# if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) && SIMDE_DETECT_CLANG_VERSION_NOT(5,0,0) -# define SIMDE_BUG_CLANG_BAD_MADD -# endif -# if SIMDE_DETECT_CLANG_VERSION_CHECK(4,0,0) && SIMDE_DETECT_CLANG_VERSION_NOT(5,0,0) -# define SIMDE_BUG_CLANG_REV_299346 /* ac9959eb533a58482ea4da6c4db1e635a98de384 */ -# endif -# if SIMDE_DETECT_CLANG_VERSION_NOT(8,0,0) -# define SIMDE_BUG_CLANG_REV_344862 /* eae26bf73715994c2bd145f9b6dc3836aa4ffd4f */ -# endif -# if HEDLEY_HAS_WARNING("-Wsign-conversion") && SIMDE_DETECT_CLANG_VERSION_NOT(11,0,0) -# define SIMDE_BUG_CLANG_45931 -# endif -# if HEDLEY_HAS_WARNING("-Wvector-conversion") && 
SIMDE_DETECT_CLANG_VERSION_NOT(11,0,0) -# define SIMDE_BUG_CLANG_44589 -# endif -# define SIMDE_BUG_CLANG_48673 // https://github.com/llvm/llvm-project/issues/48017 -# endif -# define SIMDE_BUG_CLANG_45959 // https://github.com/llvm/llvm-project/issues/45304 -# if defined(SIMDE_ARCH_WASM_SIMD128) && !SIMDE_DETECT_CLANG_VERSION_CHECK(17,0,0) -# define SIMDE_BUG_CLANG_60655 -# endif -# elif defined(HEDLEY_MSVC_VERSION) -# if defined(SIMDE_ARCH_X86) -# define SIMDE_BUG_MSVC_ROUND_EXTRACT -# endif -# elif defined(HEDLEY_INTEL_VERSION) -# define SIMDE_BUG_INTEL_857088 -# elif defined(HEDLEY_MCST_LCC_VERSION) -# define SIMDE_BUG_MCST_LCC_MISSING_AVX_LOAD_STORE_M128_FUNCS -# define SIMDE_BUG_MCST_LCC_MISSING_CMOV_M256 -# define SIMDE_BUG_MCST_LCC_FMA_WRONG_RESULT -# elif defined(HEDLEY_PGI_VERSION) -# define SIMDE_BUG_PGI_30104 -# define SIMDE_BUG_PGI_30107 -# define SIMDE_BUG_PGI_30106 -# endif -#endif - -/* GCC and Clang both have the same issue: - * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95144 - * https://bugs.llvm.org/show_bug.cgi?id=45931 - * This is just an easy way to work around it. - */ -#if \ - (HEDLEY_HAS_WARNING("-Wsign-conversion") && SIMDE_DETECT_CLANG_VERSION_NOT(11,0,0)) || \ - HEDLEY_GCC_VERSION_CHECK(4,3,0) -# define SIMDE_BUG_IGNORE_SIGN_CONVERSION(expr) (__extension__ ({ \ - HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("GCC diagnostic ignored \"-Wsign-conversion\"") \ - __typeof__(expr) simde_bug_ignore_sign_conversion_v_= (expr); \ - HEDLEY_DIAGNOSTIC_POP \ - simde_bug_ignore_sign_conversion_v_; \ - })) -#else -# define SIMDE_BUG_IGNORE_SIGN_CONVERSION(expr) (expr) -#endif - -/* Usually the shift count is signed (for example, NEON or SSE). - * OTOH, unsigned is good for PPC (vec_srl uses unsigned), and the only option for E2K. - * Further info: https://github.com/simd-everywhere/simde/pull/700 - */ -#if defined(SIMDE_ARCH_E2K) || defined(SIMDE_ARCH_POWER) - #define SIMDE_CAST_VECTOR_SHIFT_COUNT(width, value) HEDLEY_STATIC_CAST(uint##width##_t, (value)) -#else - #define SIMDE_CAST_VECTOR_SHIFT_COUNT(width, value) HEDLEY_STATIC_CAST(int##width##_t, (value)) -#endif - -/* SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED_ */ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_COMMON_H) */ diff --git a/extern/simde/simde-constify.h b/extern/simde/simde-constify.h deleted file mode 100644 index 94a9d3897..000000000 --- a/extern/simde/simde-constify.h +++ /dev/null @@ -1,397 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - */ - -/* Constify macros. For internal use only. - * - * These are used to make it possible to call a function which takes - * an Integer Constant Expression (ICE) using a compile time constant. - * Technically it would also be possible to use a value not trivially - * known by the compiler, but there would be a siginficant performance - * hit (a switch switch is used). - * - * The basic idea is pretty simple; we just emit a do while loop which - * contains a switch with a case for every possible value of the - * constant. - * - * As long as the value you pass to the function in constant, pretty - * much any copmiler shouldn't have a problem generating exactly the - * same code as if you had used an ICE. - * - * This is intended to be used in the SIMDe implementations of - * functions the compilers require to be an ICE, but the other benefit - * is that if we also disable the warnings from - * SIMDE_REQUIRE_CONSTANT_RANGE we can actually just allow the tests - * to use non-ICE parameters - */ - -#if !defined(SIMDE_CONSTIFY_H) -#define SIMDE_CONSTIFY_H - -#include "simde-diagnostic.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DIAGNOSTIC_DISABLE_VARIADIC_MACROS_ -SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ - -#define SIMDE_CONSTIFY_2_(func_name, result, default_case, imm, ...) \ - do { \ - switch(imm) { \ - case 0: result = func_name(__VA_ARGS__, 0); break; \ - case 1: result = func_name(__VA_ARGS__, 1); break; \ - default: result = default_case; break; \ - } \ - } while (0) - -#define SIMDE_CONSTIFY_4_(func_name, result, default_case, imm, ...) \ - do { \ - switch(imm) { \ - case 0: result = func_name(__VA_ARGS__, 0); break; \ - case 1: result = func_name(__VA_ARGS__, 1); break; \ - case 2: result = func_name(__VA_ARGS__, 2); break; \ - case 3: result = func_name(__VA_ARGS__, 3); break; \ - default: result = default_case; break; \ - } \ - } while (0) - -#define SIMDE_CONSTIFY_8_(func_name, result, default_case, imm, ...) \ - do { \ - switch(imm) { \ - case 0: result = func_name(__VA_ARGS__, 0); break; \ - case 1: result = func_name(__VA_ARGS__, 1); break; \ - case 2: result = func_name(__VA_ARGS__, 2); break; \ - case 3: result = func_name(__VA_ARGS__, 3); break; \ - case 4: result = func_name(__VA_ARGS__, 4); break; \ - case 5: result = func_name(__VA_ARGS__, 5); break; \ - case 6: result = func_name(__VA_ARGS__, 6); break; \ - case 7: result = func_name(__VA_ARGS__, 7); break; \ - default: result = default_case; break; \ - } \ - } while (0) - -#define SIMDE_CONSTIFY_16_(func_name, result, default_case, imm, ...) 
\ - do { \ - switch(imm) { \ - case 0: result = func_name(__VA_ARGS__, 0); break; \ - case 1: result = func_name(__VA_ARGS__, 1); break; \ - case 2: result = func_name(__VA_ARGS__, 2); break; \ - case 3: result = func_name(__VA_ARGS__, 3); break; \ - case 4: result = func_name(__VA_ARGS__, 4); break; \ - case 5: result = func_name(__VA_ARGS__, 5); break; \ - case 6: result = func_name(__VA_ARGS__, 6); break; \ - case 7: result = func_name(__VA_ARGS__, 7); break; \ - case 8: result = func_name(__VA_ARGS__, 8); break; \ - case 9: result = func_name(__VA_ARGS__, 9); break; \ - case 10: result = func_name(__VA_ARGS__, 10); break; \ - case 11: result = func_name(__VA_ARGS__, 11); break; \ - case 12: result = func_name(__VA_ARGS__, 12); break; \ - case 13: result = func_name(__VA_ARGS__, 13); break; \ - case 14: result = func_name(__VA_ARGS__, 14); break; \ - case 15: result = func_name(__VA_ARGS__, 15); break; \ - default: result = default_case; break; \ - } \ - } while (0) - -#define SIMDE_CONSTIFY_32_(func_name, result, default_case, imm, ...) \ - do { \ - switch(imm) { \ - case 0: result = func_name(__VA_ARGS__, 0); break; \ - case 1: result = func_name(__VA_ARGS__, 1); break; \ - case 2: result = func_name(__VA_ARGS__, 2); break; \ - case 3: result = func_name(__VA_ARGS__, 3); break; \ - case 4: result = func_name(__VA_ARGS__, 4); break; \ - case 5: result = func_name(__VA_ARGS__, 5); break; \ - case 6: result = func_name(__VA_ARGS__, 6); break; \ - case 7: result = func_name(__VA_ARGS__, 7); break; \ - case 8: result = func_name(__VA_ARGS__, 8); break; \ - case 9: result = func_name(__VA_ARGS__, 9); break; \ - case 10: result = func_name(__VA_ARGS__, 10); break; \ - case 11: result = func_name(__VA_ARGS__, 11); break; \ - case 12: result = func_name(__VA_ARGS__, 12); break; \ - case 13: result = func_name(__VA_ARGS__, 13); break; \ - case 14: result = func_name(__VA_ARGS__, 14); break; \ - case 15: result = func_name(__VA_ARGS__, 15); break; \ - case 16: result = func_name(__VA_ARGS__, 16); break; \ - case 17: result = func_name(__VA_ARGS__, 17); break; \ - case 18: result = func_name(__VA_ARGS__, 18); break; \ - case 19: result = func_name(__VA_ARGS__, 19); break; \ - case 20: result = func_name(__VA_ARGS__, 20); break; \ - case 21: result = func_name(__VA_ARGS__, 21); break; \ - case 22: result = func_name(__VA_ARGS__, 22); break; \ - case 23: result = func_name(__VA_ARGS__, 23); break; \ - case 24: result = func_name(__VA_ARGS__, 24); break; \ - case 25: result = func_name(__VA_ARGS__, 25); break; \ - case 26: result = func_name(__VA_ARGS__, 26); break; \ - case 27: result = func_name(__VA_ARGS__, 27); break; \ - case 28: result = func_name(__VA_ARGS__, 28); break; \ - case 29: result = func_name(__VA_ARGS__, 29); break; \ - case 30: result = func_name(__VA_ARGS__, 30); break; \ - case 31: result = func_name(__VA_ARGS__, 31); break; \ - default: result = default_case; break; \ - } \ - } while (0) - -#define SIMDE_CONSTIFY_64_(func_name, result, default_case, imm, ...) 
\ - do { \ - switch(imm) { \ - case 0: result = func_name(__VA_ARGS__, 0); break; \ - case 1: result = func_name(__VA_ARGS__, 1); break; \ - case 2: result = func_name(__VA_ARGS__, 2); break; \ - case 3: result = func_name(__VA_ARGS__, 3); break; \ - case 4: result = func_name(__VA_ARGS__, 4); break; \ - case 5: result = func_name(__VA_ARGS__, 5); break; \ - case 6: result = func_name(__VA_ARGS__, 6); break; \ - case 7: result = func_name(__VA_ARGS__, 7); break; \ - case 8: result = func_name(__VA_ARGS__, 8); break; \ - case 9: result = func_name(__VA_ARGS__, 9); break; \ - case 10: result = func_name(__VA_ARGS__, 10); break; \ - case 11: result = func_name(__VA_ARGS__, 11); break; \ - case 12: result = func_name(__VA_ARGS__, 12); break; \ - case 13: result = func_name(__VA_ARGS__, 13); break; \ - case 14: result = func_name(__VA_ARGS__, 14); break; \ - case 15: result = func_name(__VA_ARGS__, 15); break; \ - case 16: result = func_name(__VA_ARGS__, 16); break; \ - case 17: result = func_name(__VA_ARGS__, 17); break; \ - case 18: result = func_name(__VA_ARGS__, 18); break; \ - case 19: result = func_name(__VA_ARGS__, 19); break; \ - case 20: result = func_name(__VA_ARGS__, 20); break; \ - case 21: result = func_name(__VA_ARGS__, 21); break; \ - case 22: result = func_name(__VA_ARGS__, 22); break; \ - case 23: result = func_name(__VA_ARGS__, 23); break; \ - case 24: result = func_name(__VA_ARGS__, 24); break; \ - case 25: result = func_name(__VA_ARGS__, 25); break; \ - case 26: result = func_name(__VA_ARGS__, 26); break; \ - case 27: result = func_name(__VA_ARGS__, 27); break; \ - case 28: result = func_name(__VA_ARGS__, 28); break; \ - case 29: result = func_name(__VA_ARGS__, 29); break; \ - case 30: result = func_name(__VA_ARGS__, 30); break; \ - case 31: result = func_name(__VA_ARGS__, 31); break; \ - case 32: result = func_name(__VA_ARGS__, 32); break; \ - case 33: result = func_name(__VA_ARGS__, 33); break; \ - case 34: result = func_name(__VA_ARGS__, 34); break; \ - case 35: result = func_name(__VA_ARGS__, 35); break; \ - case 36: result = func_name(__VA_ARGS__, 36); break; \ - case 37: result = func_name(__VA_ARGS__, 37); break; \ - case 38: result = func_name(__VA_ARGS__, 38); break; \ - case 39: result = func_name(__VA_ARGS__, 39); break; \ - case 40: result = func_name(__VA_ARGS__, 40); break; \ - case 41: result = func_name(__VA_ARGS__, 41); break; \ - case 42: result = func_name(__VA_ARGS__, 42); break; \ - case 43: result = func_name(__VA_ARGS__, 43); break; \ - case 44: result = func_name(__VA_ARGS__, 44); break; \ - case 45: result = func_name(__VA_ARGS__, 45); break; \ - case 46: result = func_name(__VA_ARGS__, 46); break; \ - case 47: result = func_name(__VA_ARGS__, 47); break; \ - case 48: result = func_name(__VA_ARGS__, 48); break; \ - case 49: result = func_name(__VA_ARGS__, 49); break; \ - case 50: result = func_name(__VA_ARGS__, 50); break; \ - case 51: result = func_name(__VA_ARGS__, 51); break; \ - case 52: result = func_name(__VA_ARGS__, 52); break; \ - case 53: result = func_name(__VA_ARGS__, 53); break; \ - case 54: result = func_name(__VA_ARGS__, 54); break; \ - case 55: result = func_name(__VA_ARGS__, 55); break; \ - case 56: result = func_name(__VA_ARGS__, 56); break; \ - case 57: result = func_name(__VA_ARGS__, 57); break; \ - case 58: result = func_name(__VA_ARGS__, 58); break; \ - case 59: result = func_name(__VA_ARGS__, 59); break; \ - case 60: result = func_name(__VA_ARGS__, 60); break; \ - case 61: result = func_name(__VA_ARGS__, 61); break; \ - case 62: 
result = func_name(__VA_ARGS__, 62); break; \ - case 63: result = func_name(__VA_ARGS__, 63); break; \ - default: result = default_case; break; \ - } \ - } while (0) - -#define SIMDE_CONSTIFY_2_NO_RESULT_(func_name, default_case, imm, ...) \ - do { \ - switch(imm) { \ - case 0: func_name(__VA_ARGS__, 0); break; \ - case 1: func_name(__VA_ARGS__, 1); break; \ - default: default_case; break; \ - } \ - } while (0) - -#define SIMDE_CONSTIFY_4_NO_RESULT_(func_name, default_case, imm, ...) \ - do { \ - switch(imm) { \ - case 0: func_name(__VA_ARGS__, 0); break; \ - case 1: func_name(__VA_ARGS__, 1); break; \ - case 2: func_name(__VA_ARGS__, 2); break; \ - case 3: func_name(__VA_ARGS__, 3); break; \ - default: default_case; break; \ - } \ - } while (0) - -#define SIMDE_CONSTIFY_8_NO_RESULT_(func_name, default_case, imm, ...) \ - do { \ - switch(imm) { \ - case 0: func_name(__VA_ARGS__, 0); break; \ - case 1: func_name(__VA_ARGS__, 1); break; \ - case 2: func_name(__VA_ARGS__, 2); break; \ - case 3: func_name(__VA_ARGS__, 3); break; \ - case 4: func_name(__VA_ARGS__, 4); break; \ - case 5: func_name(__VA_ARGS__, 5); break; \ - case 6: func_name(__VA_ARGS__, 6); break; \ - case 7: func_name(__VA_ARGS__, 7); break; \ - default: default_case; break; \ - } \ - } while (0) - -#define SIMDE_CONSTIFY_16_NO_RESULT_(func_name, default_case, imm, ...) \ - do { \ - switch(imm) { \ - case 0: func_name(__VA_ARGS__, 0); break; \ - case 1: func_name(__VA_ARGS__, 1); break; \ - case 2: func_name(__VA_ARGS__, 2); break; \ - case 3: func_name(__VA_ARGS__, 3); break; \ - case 4: func_name(__VA_ARGS__, 4); break; \ - case 5: func_name(__VA_ARGS__, 5); break; \ - case 6: func_name(__VA_ARGS__, 6); break; \ - case 7: func_name(__VA_ARGS__, 7); break; \ - case 8: func_name(__VA_ARGS__, 8); break; \ - case 9: func_name(__VA_ARGS__, 9); break; \ - case 10: func_name(__VA_ARGS__, 10); break; \ - case 11: func_name(__VA_ARGS__, 11); break; \ - case 12: func_name(__VA_ARGS__, 12); break; \ - case 13: func_name(__VA_ARGS__, 13); break; \ - case 14: func_name(__VA_ARGS__, 14); break; \ - case 15: func_name(__VA_ARGS__, 15); break; \ - default: default_case; break; \ - } \ - } while (0) - -#define SIMDE_CONSTIFY_32_NO_RESULT_(func_name, default_case, imm, ...) 
\ - do { \ - switch(imm) { \ - case 0: func_name(__VA_ARGS__, 0); break; \ - case 1: func_name(__VA_ARGS__, 1); break; \ - case 2: func_name(__VA_ARGS__, 2); break; \ - case 3: func_name(__VA_ARGS__, 3); break; \ - case 4: func_name(__VA_ARGS__, 4); break; \ - case 5: func_name(__VA_ARGS__, 5); break; \ - case 6: func_name(__VA_ARGS__, 6); break; \ - case 7: func_name(__VA_ARGS__, 7); break; \ - case 8: func_name(__VA_ARGS__, 8); break; \ - case 9: func_name(__VA_ARGS__, 9); break; \ - case 10: func_name(__VA_ARGS__, 10); break; \ - case 11: func_name(__VA_ARGS__, 11); break; \ - case 12: func_name(__VA_ARGS__, 12); break; \ - case 13: func_name(__VA_ARGS__, 13); break; \ - case 14: func_name(__VA_ARGS__, 14); break; \ - case 15: func_name(__VA_ARGS__, 15); break; \ - case 16: func_name(__VA_ARGS__, 16); break; \ - case 17: func_name(__VA_ARGS__, 17); break; \ - case 18: func_name(__VA_ARGS__, 18); break; \ - case 19: func_name(__VA_ARGS__, 19); break; \ - case 20: func_name(__VA_ARGS__, 20); break; \ - case 21: func_name(__VA_ARGS__, 21); break; \ - case 22: func_name(__VA_ARGS__, 22); break; \ - case 23: func_name(__VA_ARGS__, 23); break; \ - case 24: func_name(__VA_ARGS__, 24); break; \ - case 25: func_name(__VA_ARGS__, 25); break; \ - case 26: func_name(__VA_ARGS__, 26); break; \ - case 27: func_name(__VA_ARGS__, 27); break; \ - case 28: func_name(__VA_ARGS__, 28); break; \ - case 29: func_name(__VA_ARGS__, 29); break; \ - case 30: func_name(__VA_ARGS__, 30); break; \ - case 31: func_name(__VA_ARGS__, 31); break; \ - default: default_case; break; \ - } \ - } while (0) - -#define SIMDE_CONSTIFY_64_NO_RESULT_(func_name, default_case, imm, ...) \ - do { \ - switch(imm) { \ - case 0: func_name(__VA_ARGS__, 0); break; \ - case 1: func_name(__VA_ARGS__, 1); break; \ - case 2: func_name(__VA_ARGS__, 2); break; \ - case 3: func_name(__VA_ARGS__, 3); break; \ - case 4: func_name(__VA_ARGS__, 4); break; \ - case 5: func_name(__VA_ARGS__, 5); break; \ - case 6: func_name(__VA_ARGS__, 6); break; \ - case 7: func_name(__VA_ARGS__, 7); break; \ - case 8: func_name(__VA_ARGS__, 8); break; \ - case 9: func_name(__VA_ARGS__, 9); break; \ - case 10: func_name(__VA_ARGS__, 10); break; \ - case 11: func_name(__VA_ARGS__, 11); break; \ - case 12: func_name(__VA_ARGS__, 12); break; \ - case 13: func_name(__VA_ARGS__, 13); break; \ - case 14: func_name(__VA_ARGS__, 14); break; \ - case 15: func_name(__VA_ARGS__, 15); break; \ - case 16: func_name(__VA_ARGS__, 16); break; \ - case 17: func_name(__VA_ARGS__, 17); break; \ - case 18: func_name(__VA_ARGS__, 18); break; \ - case 19: func_name(__VA_ARGS__, 19); break; \ - case 20: func_name(__VA_ARGS__, 20); break; \ - case 21: func_name(__VA_ARGS__, 21); break; \ - case 22: func_name(__VA_ARGS__, 22); break; \ - case 23: func_name(__VA_ARGS__, 23); break; \ - case 24: func_name(__VA_ARGS__, 24); break; \ - case 25: func_name(__VA_ARGS__, 25); break; \ - case 26: func_name(__VA_ARGS__, 26); break; \ - case 27: func_name(__VA_ARGS__, 27); break; \ - case 28: func_name(__VA_ARGS__, 28); break; \ - case 29: func_name(__VA_ARGS__, 29); break; \ - case 30: func_name(__VA_ARGS__, 30); break; \ - case 31: func_name(__VA_ARGS__, 31); break; \ - case 32: func_name(__VA_ARGS__, 32); break; \ - case 33: func_name(__VA_ARGS__, 33); break; \ - case 34: func_name(__VA_ARGS__, 34); break; \ - case 35: func_name(__VA_ARGS__, 35); break; \ - case 36: func_name(__VA_ARGS__, 36); break; \ - case 37: func_name(__VA_ARGS__, 37); break; \ - case 38: func_name(__VA_ARGS__, 38); break; \ 
- case 39: func_name(__VA_ARGS__, 39); break; \ - case 40: func_name(__VA_ARGS__, 40); break; \ - case 41: func_name(__VA_ARGS__, 41); break; \ - case 42: func_name(__VA_ARGS__, 42); break; \ - case 43: func_name(__VA_ARGS__, 43); break; \ - case 44: func_name(__VA_ARGS__, 44); break; \ - case 45: func_name(__VA_ARGS__, 45); break; \ - case 46: func_name(__VA_ARGS__, 46); break; \ - case 47: func_name(__VA_ARGS__, 47); break; \ - case 48: func_name(__VA_ARGS__, 48); break; \ - case 49: func_name(__VA_ARGS__, 49); break; \ - case 50: func_name(__VA_ARGS__, 50); break; \ - case 51: func_name(__VA_ARGS__, 51); break; \ - case 52: func_name(__VA_ARGS__, 52); break; \ - case 53: func_name(__VA_ARGS__, 53); break; \ - case 54: func_name(__VA_ARGS__, 54); break; \ - case 55: func_name(__VA_ARGS__, 55); break; \ - case 56: func_name(__VA_ARGS__, 56); break; \ - case 57: func_name(__VA_ARGS__, 57); break; \ - case 58: func_name(__VA_ARGS__, 58); break; \ - case 59: func_name(__VA_ARGS__, 59); break; \ - case 60: func_name(__VA_ARGS__, 60); break; \ - case 61: func_name(__VA_ARGS__, 61); break; \ - case 62: func_name(__VA_ARGS__, 62); break; \ - case 63: func_name(__VA_ARGS__, 63); break; \ - default: default_case; break; \ - } \ - } while (0) - -HEDLEY_DIAGNOSTIC_POP - -#endif diff --git a/extern/simde/simde-detect-clang.h b/extern/simde/simde-detect-clang.h deleted file mode 100644 index 15d695a89..000000000 --- a/extern/simde/simde-detect-clang.h +++ /dev/null @@ -1,125 +0,0 @@ -/* Detect Clang Version - * Created by Evan Nemerson - * - * To the extent possible under law, the author(s) have dedicated all - * copyright and related and neighboring rights to this software to - * the public domain worldwide. This software is distributed without - * any warranty. - * - * For details, see . - * SPDX-License-Identifier: CC0-1.0 - */ - -/* This file was originally part of SIMDe - * (). You're free to do with it as - * you please, but I do have a few small requests: - * - * * If you make improvements, please submit them back to SIMDe - * (at ) so others can - * benefit from them. - * * Please keep a link to SIMDe intact so people know where to submit - * improvements. - * * If you expose it publicly, please change the SIMDE_ prefix to - * something specific to your project. - * - * The version numbers clang exposes (in the ___clang_major__, - * __clang_minor__, and __clang_patchlevel__ macros) are unreliable. - * Vendors such as Apple will define these values to their version - * numbers; for example, "Apple Clang 4.0" is really clang 3.1, but - * __clang_major__ and __clang_minor__ are defined to 4 and 0 - * respectively, instead of 3 and 1. - * - * The solution is *usually* to use clang's feature detection macros - * () - * to determine if the feature you're interested in is available. This - * generally works well, and it should probably be the first thing you - * try. Unfortunately, it's not possible to check for everything. In - * particular, compiler bugs. - * - * This file just uses the feature checking macros to detect features - * added in specific versions of clang to identify which version of - * clang the compiler is based on. - * - * Right now it only goes back to 3.6, but I'm happy to accept patches - * to go back further. And, of course, newer versions are welcome if - * they're not already present, and if you find a way to detect a point - * release that would be great, too! 
- */ - -#if !defined(SIMDE_DETECT_CLANG_H) -#define SIMDE_DETECT_CLANG_H 1 - -/* Attempt to detect the upstream clang version number. I usually only - * worry about major version numbers (at least for 4.0+), but if you - * need more resolution I'm happy to accept patches that are able to - * detect minor versions as well. That said, you'll probably have a - * hard time with detection since AFAIK most minor releases don't add - * anything we can detect. Updated based on - * https://github.com/google/highway/blob/438c705a295176b96a50336527bb3e7ea365ffac/hwy/detect_compiler_arch.h#L73 - * - would welcome patches/updates there as well. - */ - -#if defined(__clang__) && !defined(SIMDE_DETECT_CLANG_VERSION) -# if __has_attribute(nouwtable) // no new warnings in 16.0 -# define SIMDE_DETECT_CLANG_VERSION 160000 -# elif __has_warning("-Warray-parameter") -# define SIMDE_DETECT_CLANG_VERSION 150000 -# elif __has_warning("-Wbitwise-instead-of-logical") -# define SIMDE_DETECT_CLANG_VERSION 140000 -# elif __has_warning("-Wwaix-compat") -# define SIMDE_DETECT_CLANG_VERSION 130000 -# elif __has_warning("-Wformat-insufficient-args") -# define SIMDE_DETECT_CLANG_VERSION 120000 -# elif __has_warning("-Wimplicit-const-int-float-conversion") -# define SIMDE_DETECT_CLANG_VERSION 110000 -# elif __has_warning("-Wmisleading-indentation") -# define SIMDE_DETECT_CLANG_VERSION 100000 -# elif defined(__FILE_NAME__) -# define SIMDE_DETECT_CLANG_VERSION 90000 -# elif __has_warning("-Wextra-semi-stmt") || __has_builtin(__builtin_rotateleft32) -# define SIMDE_DETECT_CLANG_VERSION 80000 -// For reasons unknown, XCode 10.3 (Apple LLVM version 10.0.1) is apparently -// based on Clang 7, but does not support the warning we test. -// See https://en.wikipedia.org/wiki/Xcode#Toolchain_versions and -// https://trac.macports.org/wiki/XcodeVersionInfo. -# elif __has_warning("-Wc++98-compat-extra-semi") || \ - (defined(__apple_build_version__) && __apple_build_version__ >= 10010000) -# define SIMDE_DETECT_CLANG_VERSION 70000 -# elif __has_warning("-Wpragma-pack") -# define SIMDE_DETECT_CLANG_VERSION 60000 -# elif __has_warning("-Wbitfield-enum-conversion") -# define SIMDE_DETECT_CLANG_VERSION 50000 -# elif __has_attribute(diagnose_if) -# define SIMDE_DETECT_CLANG_VERSION 40000 -# elif __has_warning("-Wcomma") -# define SIMDE_DETECT_CLANG_VERSION 39000 -# elif __has_warning("-Wdouble-promotion") -# define SIMDE_DETECT_CLANG_VERSION 38000 -# elif __has_warning("-Wshift-negative-value") -# define SIMDE_DETECT_CLANG_VERSION 37000 -# elif __has_warning("-Wambiguous-ellipsis") -# define SIMDE_DETECT_CLANG_VERSION 36000 -# else -# define SIMDE_DETECT_CLANG_VERSION 1 -# endif -#endif /* defined(__clang__) && !defined(SIMDE_DETECT_CLANG_VERSION) */ - -/* The SIMDE_DETECT_CLANG_VERSION_CHECK macro is pretty - * straightforward; it returns true if the compiler is a derivative - * of clang >= the specified version. - * - * Since this file is often (primarily?) useful for working around bugs - * it is also helpful to have a macro which returns true if only if the - * compiler is a version of clang *older* than the specified version to - * make it a bit easier to ifdef regions to add code for older versions, - * such as pragmas to disable a specific warning. 
*/ - -#if defined(SIMDE_DETECT_CLANG_VERSION) -# define SIMDE_DETECT_CLANG_VERSION_CHECK(major, minor, revision) (SIMDE_DETECT_CLANG_VERSION >= ((major * 10000) + (minor * 1000) + (revision))) -# define SIMDE_DETECT_CLANG_VERSION_NOT(major, minor, revision) (SIMDE_DETECT_CLANG_VERSION < ((major * 10000) + (minor * 1000) + (revision))) -#else -# define SIMDE_DETECT_CLANG_VERSION_CHECK(major, minor, revision) (0) -# define SIMDE_DETECT_CLANG_VERSION_NOT(major, minor, revision) (0) -#endif - -#endif /* !defined(SIMDE_DETECT_CLANG_H) */ diff --git a/extern/simde/simde-diagnostic.h b/extern/simde/simde-diagnostic.h deleted file mode 100644 index a525d3a2a..000000000 --- a/extern/simde/simde-diagnostic.h +++ /dev/null @@ -1,456 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2017-2020 Evan Nemerson - */ - -/* SIMDe targets a very wide range of standards and compilers, and our - * goal is to compile cleanly even with extremely aggressive warnings - * (i.e., -Weverything in clang, -Wextra in GCC, /W4 for MSVC, etc.) - * treated as errors. - * - * While our preference is to resolve the underlying issue a given - * diagnostic is warning us about, sometimes that's not possible. - * Fixing a warning in one compiler may cause problems in another. - * Sometimes a warning doesn't really apply to us (false positives), - * and sometimes adhering to a warning would mean dropping a feature - * we *know* the compiler supports since we have tested specifically - * for the compiler or feature. - * - * When practical, warnings are only disabled for specific code. For - * a list of warnings which are enabled by default in all SIMDe code, - * see SIMDE_DISABLE_UNWANTED_DIAGNOSTICS. Note that we restore the - * warning stack when SIMDe is done parsing, so code which includes - * SIMDe is not deprived of these warnings. - */ - -#if !defined(SIMDE_DIAGNOSTIC_H) -#define SIMDE_DIAGNOSTIC_H - -#include "hedley.h" -#include "simde-detect-clang.h" -#include "simde-arch.h" - -/* This is only to help us implement functions like _mm_undefined_ps. 
*/ -#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) - #undef SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ -#endif -#if HEDLEY_HAS_WARNING("-Wuninitialized") - #define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("clang diagnostic ignored \"-Wuninitialized\"") -#elif HEDLEY_GCC_VERSION_CHECK(4,2,0) - #define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("GCC diagnostic ignored \"-Wuninitialized\"") -#elif HEDLEY_PGI_VERSION_CHECK(19,10,0) - #define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("diag_suppress 549") -#elif HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus) - #define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("error_messages(off,SEC_UNINITIALIZED_MEM_READ,SEC_UNDEFINED_RETURN_VALUE,unassigned)") -#elif HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) - #define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("error_messages(off,SEC_UNINITIALIZED_MEM_READ,SEC_UNDEFINED_RETURN_VALUE)") -#elif HEDLEY_SUNPRO_VERSION_CHECK(5,12,0) && defined(__cplusplus) - #define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("error_messages(off,unassigned)") -#elif \ - HEDLEY_TI_VERSION_CHECK(16,9,9) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,2) - #define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("diag_suppress 551") -#elif HEDLEY_INTEL_VERSION_CHECK(13,0,0) - #define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("warning(disable:592)") -#elif HEDLEY_MSVC_VERSION_CHECK(19,0,0) && !defined(__MSVC_RUNTIME_CHECKS) - #define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ __pragma(warning(disable:4700)) -#endif - -/* GCC emits a lot of "notes" about the ABI being different for things - * in newer versions of GCC. We don't really care because all our - * functions are inlined and don't generate ABI. */ -#if HEDLEY_GCC_VERSION_CHECK(7,0,0) - #define SIMDE_DIAGNOSTIC_DISABLE_PSABI_ _Pragma("GCC diagnostic ignored \"-Wpsabi\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_PSABI_ -#endif - -/* Since MMX uses x87 FP registers, you're supposed to call _mm_empty() - * after each MMX function before any floating point instructions. - * Some compilers warn about functions which use MMX functions but - * don't call _mm_empty(). However, since SIMDe is implementyng the - * MMX API we shouldn't be calling _mm_empty(); we leave it to the - * caller to invoke simde_mm_empty(). */ -#if HEDLEY_INTEL_VERSION_CHECK(19,0,0) - #define SIMDE_DIAGNOSTIC_DISABLE_NO_EMMS_INSTRUCTION_ _Pragma("warning(disable:13200 13203)") -#elif defined(HEDLEY_MSVC_VERSION) - #define SIMDE_DIAGNOSTIC_DISABLE_NO_EMMS_INSTRUCTION_ __pragma(warning(disable:4799)) -#else - #define SIMDE_DIAGNOSTIC_DISABLE_NO_EMMS_INSTRUCTION_ -#endif - -/* Intel is pushing people to use OpenMP SIMD instead of Cilk+, so they - * emit a diagnostic if you use #pragma simd instead of - * #pragma omp simd. SIMDe supports OpenMP SIMD, you just need to - * compile with -qopenmp or -qopenmp-simd and define - * SIMDE_ENABLE_OPENMP. Cilk+ is just a fallback. */ -#if HEDLEY_INTEL_VERSION_CHECK(18,0,0) - #define SIMDE_DIAGNOSTIC_DISABLE_SIMD_PRAGMA_DEPRECATED_ _Pragma("warning(disable:3948)") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_SIMD_PRAGMA_DEPRECATED_ -#endif - -/* MSVC emits a diagnostic when we call a function (like - * simde_mm_set_epi32) while initializing a struct. We currently do - * this a *lot* in the tests. 
*/ -#if \ - defined(HEDLEY_MSVC_VERSION) - #define SIMDE_DIAGNOSTIC_DISABLE_NON_CONSTANT_AGGREGATE_INITIALIZER_ __pragma(warning(disable:4204)) -#else - #define SIMDE_DIAGNOSTIC_DISABLE_NON_CONSTANT_AGGREGATE_INITIALIZER_ -#endif - -/* This warning needs a lot of work. It is triggered if all you do is - * pass the value to memcpy/__builtin_memcpy, or if you initialize a - * member of the union, even if that member takes up the entire union. - * Last tested with clang-10, hopefully things will improve in the - * future; if clang fixes this I'd love to enable it. */ -#if \ - HEDLEY_HAS_WARNING("-Wconditional-uninitialized") - #define SIMDE_DIAGNOSTIC_DISABLE_CONDITIONAL_UNINITIALIZED_ _Pragma("clang diagnostic ignored \"-Wconditional-uninitialized\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_CONDITIONAL_UNINITIALIZED_ -#endif - -/* This warning is meant to catch things like `0.3 + 0.4 == 0.7`, which - * will is false. However, SIMDe uses these operations exclusively - * for things like _mm_cmpeq_ps, for which we really do want to check - * for equality (or inequality). - * - * If someone wants to put together a SIMDE_FLOAT_EQUAL(a, op, b) macro - * which just wraps a check in some code do disable this diagnostic I'd - * be happy to accept it. */ -#if \ - HEDLEY_HAS_WARNING("-Wfloat-equal") || \ - HEDLEY_GCC_VERSION_CHECK(3,0,0) - #define SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ _Pragma("GCC diagnostic ignored \"-Wfloat-equal\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ -#endif - -/* This is because we use HEDLEY_STATIC_ASSERT for static assertions. - * If Hedley can't find an implementation it will preprocess to - * nothing, which means there will be a trailing semi-colon. */ -#if HEDLEY_HAS_WARNING("-Wextra-semi") - #define SIMDE_DIAGNOSTIC_DISABLE_EXTRA_SEMI_ _Pragma("clang diagnostic ignored \"-Wextra-semi\"") -#elif HEDLEY_GCC_VERSION_CHECK(8,1,0) && defined(__cplusplus) - #define SIMDE_DIAGNOSTIC_DISABLE_EXTRA_SEMI_ _Pragma("GCC diagnostic ignored \"-Wextra-semi\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_EXTRA_SEMI_ -#endif - -/* We do use a few variadic macros, which technically aren't available - * until C99 and C++11, but every compiler I'm aware of has supported - * them for much longer. That said, usage is isolated to the test - * suite and compilers known to support them. */ -#if HEDLEY_HAS_WARNING("-Wvariadic-macros") || HEDLEY_GCC_VERSION_CHECK(4,0,0) - #if HEDLEY_HAS_WARNING("-Wc++98-compat-pedantic") - #define SIMDE_DIAGNOSTIC_DISABLE_VARIADIC_MACROS_ \ - _Pragma("clang diagnostic ignored \"-Wvariadic-macros\"") \ - _Pragma("clang diagnostic ignored \"-Wc++98-compat-pedantic\"") - #else - #define SIMDE_DIAGNOSTIC_DISABLE_VARIADIC_MACROS_ _Pragma("GCC diagnostic ignored \"-Wvariadic-macros\"") - #endif -#else - #define SIMDE_DIAGNOSTIC_DISABLE_VARIADIC_MACROS_ -#endif - -/* emscripten requires us to use a __wasm_unimplemented_simd128__ macro - * before we can access certain SIMD intrinsics, but this diagnostic - * warns about it being a reserved name. It is a reserved name, but - * it's reserved for the compiler and we are using it to convey - * information to the compiler. - * - * This is also used when enabling native aliases since we don't get to - * choose the macro names. 
*/ -#if HEDLEY_HAS_WARNING("-Wreserved-id-macro") - #define SIMDE_DIAGNOSTIC_DISABLE_RESERVED_ID_MACRO_ _Pragma("clang diagnostic ignored \"-Wreserved-id-macro\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_RESERVED_ID_MACRO_ -#endif - -/* Similar to above; types like simde__m128i are reserved due to the - * double underscore, but we didn't choose them, Intel did. */ -#if HEDLEY_HAS_WARNING("-Wreserved-identifier") - #define SIMDE_DIAGNOSTIC_DISABLE_RESERVED_ID_ _Pragma("clang diagnostic ignored \"-Wreserved-identifier\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_RESERVED_ID_ -#endif - -/* clang 3.8 warns about the packed attribute being unnecessary when - * used in the _mm_loadu_* functions. That *may* be true for version - * 3.8, but for later versions it is crucial in order to make unaligned - * access safe. */ -#if HEDLEY_HAS_WARNING("-Wpacked") - #define SIMDE_DIAGNOSTIC_DISABLE_PACKED_ _Pragma("clang diagnostic ignored \"-Wpacked\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_PACKED_ -#endif - -/* Triggered when assigning a float to a double implicitly. We use - * explicit casts in SIMDe, this is only used in the test suite. */ -#if HEDLEY_HAS_WARNING("-Wdouble-promotion") - #define SIMDE_DIAGNOSTIC_DISABLE_DOUBLE_PROMOTION_ _Pragma("clang diagnostic ignored \"-Wdouble-promotion\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_DOUBLE_PROMOTION_ -#endif - -/* Several compilers treat conformant array parameters as VLAs. We - * test to make sure we're in C mode (C++ doesn't support CAPs), and - * that the version of the standard supports CAPs. We also reject - * some buggy compilers like MSVC (the logic is in Hedley if you want - * to take a look), but with certain warnings enabled some compilers - * still like to emit a diagnostic. */ -#if HEDLEY_HAS_WARNING("-Wvla") - #define SIMDE_DIAGNOSTIC_DISABLE_VLA_ _Pragma("clang diagnostic ignored \"-Wvla\"") -#elif HEDLEY_GCC_VERSION_CHECK(4,3,0) - #define SIMDE_DIAGNOSTIC_DISABLE_VLA_ _Pragma("GCC diagnostic ignored \"-Wvla\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_VLA_ -#endif - -/* If you add an unused attribute to a function and don't use it, clang - * may emit this. 
*/ -#if HEDLEY_HAS_WARNING("-Wused-but-marked-unused") - #define SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED_ _Pragma("clang diagnostic ignored \"-Wused-but-marked-unused\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED_ -#endif - -#if HEDLEY_HAS_WARNING("-Wpass-failed") - #define SIMDE_DIAGNOSTIC_DISABLE_PASS_FAILED_ _Pragma("clang diagnostic ignored \"-Wpass-failed\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_PASS_FAILED_ -#endif - -#if HEDLEY_HAS_WARNING("-Wpadded") - #define SIMDE_DIAGNOSTIC_DISABLE_PADDED_ _Pragma("clang diagnostic ignored \"-Wpadded\"") -#elif HEDLEY_MSVC_VERSION_CHECK(19,0,0) /* Likely goes back further */ - #define SIMDE_DIAGNOSTIC_DISABLE_PADDED_ __pragma(warning(disable:4324)) -#else - #define SIMDE_DIAGNOSTIC_DISABLE_PADDED_ -#endif - -#if HEDLEY_HAS_WARNING("-Wzero-as-null-pointer-constant") - #define SIMDE_DIAGNOSTIC_DISABLE_ZERO_AS_NULL_POINTER_CONSTANT_ _Pragma("clang diagnostic ignored \"-Wzero-as-null-pointer-constant\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_ZERO_AS_NULL_POINTER_CONSTANT_ -#endif - -#if HEDLEY_HAS_WARNING("-Wold-style-cast") - #define SIMDE_DIAGNOSTIC_DISABLE_OLD_STYLE_CAST_ _Pragma("clang diagnostic ignored \"-Wold-style-cast\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_OLD_STYLE_CAST_ -#endif - -#if HEDLEY_HAS_WARNING("-Wcast-function-type") || HEDLEY_GCC_VERSION_CHECK(8,0,0) - #define SIMDE_DIAGNOSTIC_DISABLE_CAST_FUNCTION_TYPE_ _Pragma("GCC diagnostic ignored \"-Wcast-function-type\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_CAST_FUNCTION_TYPE_ -#endif - -/* clang will emit this warning when we use C99 extensions whan not in - * C99 mode, even though it does support this. In such cases we check - * the compiler and version first, so we know it's not a problem. */ -#if HEDLEY_HAS_WARNING("-Wc99-extensions") - #define SIMDE_DIAGNOSTIC_DISABLE_C99_EXTENSIONS_ _Pragma("clang diagnostic ignored \"-Wc99-extensions\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_C99_EXTENSIONS_ -#endif - -/* Similar problm as above; we rely on some basic C99 support, but clang - * has started warning obut this even in C17 mode with -Weverything. */ -#if HEDLEY_HAS_WARNING("-Wdeclaration-after-statement") - #define SIMDE_DIAGNOSTIC_DISABLE_DECLARATION_AFTER_STATEMENT_ _Pragma("clang diagnostic ignored \"-Wdeclaration-after-statement\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_DECLARATION_AFTER_STATEMENT_ -#endif - -/* https://github.com/simd-everywhere/simde/issues/277 */ -#if defined(HEDLEY_GCC_VERSION) && HEDLEY_GCC_VERSION_CHECK(4,6,0) && !HEDLEY_GCC_VERSION_CHECK(6,4,0) && defined(__cplusplus) - #define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_UNUSED_BUT_SET_VARIBALE_ _Pragma("GCC diagnostic ignored \"-Wunused-but-set-variable\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_UNUSED_BUT_SET_VARIBALE_ -#endif - -/* This is the warning that you normally define _CRT_SECURE_NO_WARNINGS - * to silence, but you have to do that before including anything and - * that would require reordering includes. */ -#if defined(_MSC_VER) - #define SIMDE_DIAGNOSTIC_DISABLE_ANNEX_K_ __pragma(warning(disable:4996)) -#else - #define SIMDE_DIAGNOSTIC_DISABLE_ANNEX_K_ -#endif - -/* Some compilers, such as clang, may use `long long` for 64-bit - * integers, but `long long` triggers a diagnostic with - * -Wc++98-compat-pedantic which says 'long long' is incompatible with - * C++98. 
*/ -#if HEDLEY_HAS_WARNING("-Wc++98-compat-pedantic") - #if HEDLEY_HAS_WARNING("-Wc++11-long-long") - #define SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ \ - _Pragma("clang diagnostic ignored \"-Wc++98-compat-pedantic\"") \ - _Pragma("clang diagnostic ignored \"-Wc++11-long-long\"") - #else - #define SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ _Pragma("clang diagnostic ignored \"-Wc++98-compat-pedantic\"") - #endif -#else - #define SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ -#endif - -/* Some problem as above */ -#if HEDLEY_HAS_WARNING("-Wc++11-long-long") - #define SIMDE_DIAGNOSTIC_DISABLE_CPP11_LONG_LONG_ _Pragma("clang diagnostic ignored \"-Wc++11-long-long\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_CPP11_LONG_LONG_ -#endif - -/* emscripten emits this whenever stdin/stdout/stderr is used in a - * macro. */ -#if HEDLEY_HAS_WARNING("-Wdisabled-macro-expansion") - #define SIMDE_DIAGNOSTIC_DISABLE_DISABLED_MACRO_EXPANSION_ _Pragma("clang diagnostic ignored \"-Wdisabled-macro-expansion\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_DISABLED_MACRO_EXPANSION_ -#endif - -/* Clang uses C11 generic selections to implement some AltiVec - * functions, which triggers this diagnostic when not compiling - * in C11 mode */ -#if HEDLEY_HAS_WARNING("-Wc11-extensions") - #define SIMDE_DIAGNOSTIC_DISABLE_C11_EXTENSIONS_ _Pragma("clang diagnostic ignored \"-Wc11-extensions\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_C11_EXTENSIONS_ -#endif - -/* Clang sometimes triggers this warning in macros in the AltiVec and - * NEON headers, or due to missing functions. */ -#if HEDLEY_HAS_WARNING("-Wvector-conversion") - #define SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_ _Pragma("clang diagnostic ignored \"-Wvector-conversion\"") - /* For NEON, the situation with -Wvector-conversion in clang < 10 is - * bad enough that we just disable the warning altogether. On x86, - * clang has similar issues on several sse4.2+ intrinsics before 3.8. */ - #if \ - (defined(SIMDE_ARCH_ARM) && SIMDE_DETECT_CLANG_VERSION_NOT(10,0,0)) || \ - SIMDE_DETECT_CLANG_VERSION_NOT(3,8,0) - #define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_VECTOR_CONVERSION_ SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_ - #endif -#else - #define SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_ -#endif -#if !defined(SIMDE_DIAGNOSTIC_DISABLE_BUGGY_VECTOR_CONVERSION_) - #define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_VECTOR_CONVERSION_ -#endif - -/* Prior to 5.0, clang didn't support disabling diagnostics in - * statement exprs. As a result, some macros we use don't - * properly silence warnings. 
*/ -#if SIMDE_DETECT_CLANG_VERSION_NOT(5,0,0) && HEDLEY_HAS_WARNING("-Wcast-qual") && HEDLEY_HAS_WARNING("-Wcast-align") - #define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_CASTS_ _Pragma("clang diagnostic ignored \"-Wcast-qual\"") _Pragma("clang diagnostic ignored \"-Wcast-align\"") -#elif SIMDE_DETECT_CLANG_VERSION_NOT(5,0,0) && HEDLEY_HAS_WARNING("-Wcast-qual") - #define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_CASTS_ _Pragma("clang diagnostic ignored \"-Wcast-qual\"") -#elif SIMDE_DETECT_CLANG_VERSION_NOT(5,0,0) && HEDLEY_HAS_WARNING("-Wcast-align") - #define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_CASTS_ _Pragma("clang diagnostic ignored \"-Wcast-align\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_CASTS_ -#endif - -/* SLEEF triggers this a *lot* in their headers */ -#if HEDLEY_HAS_WARNING("-Wignored-qualifiers") - #define SIMDE_DIAGNOSTIC_DISABLE_IGNORED_QUALIFIERS_ _Pragma("clang diagnostic ignored \"-Wignored-qualifiers\"") -#elif HEDLEY_GCC_VERSION_CHECK(4,3,0) - #define SIMDE_DIAGNOSTIC_DISABLE_IGNORED_QUALIFIERS_ _Pragma("GCC diagnostic ignored \"-Wignored-qualifiers\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_IGNORED_QUALIFIERS_ -#endif - -/* GCC emits this under some circumstances when using __int128 */ -#if HEDLEY_GCC_VERSION_CHECK(4,8,0) - #define SIMDE_DIAGNOSTIC_DISABLE_PEDANTIC_ _Pragma("GCC diagnostic ignored \"-Wpedantic\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_PEDANTIC_ -#endif - -/* MSVC doesn't like (__assume(0), code) and will warn about code being - * unreachable, but we want it there because not all compilers - * understand the unreachable macro and will complain if it is missing. - * I'm planning on adding a new macro to Hedley to handle this a bit - * more elegantly, but until then... */ -#if defined(HEDLEY_MSVC_VERSION) - #define SIMDE_DIAGNOSTIC_DISABLE_UNREACHABLE_ __pragma(warning(disable:4702)) -#elif defined(__clang__) - #define SIMDE_DIAGNOSTIC_DISABLE_UNREACHABLE_ HEDLEY_PRAGMA(clang diagnostic ignored "-Wunreachable-code") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_UNREACHABLE_ -#endif - -/* This is a false positive from GCC in a few places. */ -#if HEDLEY_GCC_VERSION_CHECK(4,7,0) - #define SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ _Pragma("GCC diagnostic ignored \"-Wmaybe-uninitialized\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ -#endif - -#if defined(SIMDE_ENABLE_NATIVE_ALIASES) - #define SIMDE_DISABLE_UNWANTED_DIAGNOSTICS_NATIVE_ALIASES_ \ - SIMDE_DIAGNOSTIC_DISABLE_RESERVED_ID_MACRO_ -#else - #define SIMDE_DISABLE_UNWANTED_DIAGNOSTICS_NATIVE_ALIASES_ -#endif - -/* Some native functions on E2K with instruction set < v6 are declared - * as deprecated due to inefficiency. Still they are more efficient - * than SIMDe implementation. So we're using them, and switching off - * these deprecation warnings. 
*/ -#if defined(HEDLEY_MCST_LCC_VERSION) -# define SIMDE_LCC_DISABLE_DEPRECATED_WARNINGS _Pragma("diag_suppress 1215,1444") -# define SIMDE_LCC_REVERT_DEPRECATED_WARNINGS _Pragma("diag_default 1215,1444") -#else -# define SIMDE_LCC_DISABLE_DEPRECATED_WARNINGS -# define SIMDE_LCC_REVERT_DEPRECATED_WARNINGS -#endif - -#define SIMDE_DISABLE_UNWANTED_DIAGNOSTICS \ - HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION \ - SIMDE_DISABLE_UNWANTED_DIAGNOSTICS_NATIVE_ALIASES_ \ - SIMDE_DIAGNOSTIC_DISABLE_PSABI_ \ - SIMDE_DIAGNOSTIC_DISABLE_NO_EMMS_INSTRUCTION_ \ - SIMDE_DIAGNOSTIC_DISABLE_SIMD_PRAGMA_DEPRECATED_ \ - SIMDE_DIAGNOSTIC_DISABLE_CONDITIONAL_UNINITIALIZED_ \ - SIMDE_DIAGNOSTIC_DISABLE_DECLARATION_AFTER_STATEMENT_ \ - SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ \ - SIMDE_DIAGNOSTIC_DISABLE_NON_CONSTANT_AGGREGATE_INITIALIZER_ \ - SIMDE_DIAGNOSTIC_DISABLE_EXTRA_SEMI_ \ - SIMDE_DIAGNOSTIC_DISABLE_VLA_ \ - SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED_ \ - SIMDE_DIAGNOSTIC_DISABLE_PASS_FAILED_ \ - SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ \ - SIMDE_DIAGNOSTIC_DISABLE_CPP11_LONG_LONG_ \ - SIMDE_DIAGNOSTIC_DISABLE_BUGGY_UNUSED_BUT_SET_VARIBALE_ \ - SIMDE_DIAGNOSTIC_DISABLE_BUGGY_CASTS_ \ - SIMDE_DIAGNOSTIC_DISABLE_BUGGY_VECTOR_CONVERSION_ \ - SIMDE_DIAGNOSTIC_DISABLE_RESERVED_ID_ - -#endif /* !defined(SIMDE_DIAGNOSTIC_H) */ diff --git a/extern/simde/simde-f16.h b/extern/simde/simde-f16.h deleted file mode 100644 index 632ef626a..000000000 --- a/extern/simde/simde-f16.h +++ /dev/null @@ -1,319 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2021 Evan Nemerson - */ - -#include "hedley.h" -#include "simde-common.h" -#include "simde-detect-clang.h" - -#if !defined(SIMDE_FLOAT16_H) -#define SIMDE_FLOAT16_H - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -/* Portable version which should work on pretty much any compiler. - * Obviously you can't rely on compiler support for things like - * conversion to/from 32-bit floats, so make sure you always use the - * functions and macros in this file! - * - * The portable implementations are (heavily) based on CC0 code by - * Fabian Giesen: (see also - * ). - * I have basically just modified it to get rid of some UB (lots of - * aliasing, right shifting a negative value), use fixed-width types, - * and work in C. */ -#define SIMDE_FLOAT16_API_PORTABLE 1 -/* _Float16, per C standard (TS 18661-3; - * ). 
*/ -#define SIMDE_FLOAT16_API_FLOAT16 2 -/* clang >= 6.0 supports __fp16 as an interchange format on all - * targets, but only allows you to use them for arguments and return - * values on targets which have defined an ABI. We get around the - * restriction by wrapping the __fp16 in a struct, but we can't do - * that on Arm since it would break compatibility with the NEON F16 - * functions. */ -#define SIMDE_FLOAT16_API_FP16_NO_ABI 3 -/* This is basically __fp16 as specified by Arm, where arugments and - * return values are raw __fp16 values not structs. */ -#define SIMDE_FLOAT16_API_FP16 4 - -/* Choosing an implementation. This is a bit rough, but I don't have - * any ideas on how to improve it. If you do, patches are definitely - * welcome. */ -#if !defined(SIMDE_FLOAT16_API) - #if defined(__ARM_FP16_FORMAT_IEEE) && (defined(SIMDE_ARM_NEON_FP16) || defined(__ARM_FP16_ARGS)) - #define SIMDE_FLOAT16_API SIMDE_FLOAT16_API_FP16 - #elif !defined(__EMSCRIPTEN__) && !(defined(__clang__) && defined(SIMDE_ARCH_POWER)) && \ - !(defined(HEDLEY_MSVC_VERSION) && defined(__clang__)) && \ - !(defined(SIMDE_ARCH_MIPS) && defined(__clang__)) && \ - !(defined(__clang__) && defined(SIMDE_ARCH_RISCV64)) && ( \ - defined(SIMDE_X86_AVX512FP16_NATIVE) || \ - (defined(SIMDE_ARCH_X86_SSE2) && HEDLEY_GCC_VERSION_CHECK(12,0,0)) || \ - (defined(SIMDE_ARCH_AARCH64) && HEDLEY_GCC_VERSION_CHECK(7,0,0) && !defined(__cplusplus)) || \ - ((defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64)) && SIMDE_DETECT_CLANG_VERSION_CHECK(15,0,0)) || \ - (!(defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64)) && SIMDE_DETECT_CLANG_VERSION_CHECK(6,0,0))) - /* We haven't found a better way to detect this. It seems like defining - * __STDC_WANT_IEC_60559_TYPES_EXT__, then including float.h, then - * checking for defined(FLT16_MAX) should work, but both gcc and - * clang will define the constants even if _Float16 is not - * supported. Ideas welcome. */ - #define SIMDE_FLOAT16_API SIMDE_FLOAT16_API_FLOAT16 - #elif defined(__FLT16_MIN__) && \ - (defined(__clang__) && \ - (!defined(SIMDE_ARCH_AARCH64) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) \ - && !defined(SIMDE_ARCH_RISCV64)) - #define SIMDE_FLOAT16_API SIMDE_FLOAT16_API_FP16_NO_ABI - #else - #define SIMDE_FLOAT16_API SIMDE_FLOAT16_API_PORTABLE - #endif -#endif - -#if SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16 - typedef _Float16 simde_float16; - #define SIMDE_FLOAT16_IS_SCALAR 1 - #if !defined(__cplusplus) - #define SIMDE_FLOAT16_C(value) value##f16 - #else - #define SIMDE_FLOAT16_C(value) HEDLEY_STATIC_CAST(_Float16, (value)) - #endif -#elif SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16_NO_ABI - typedef struct { __fp16 value; } simde_float16; - #if defined(SIMDE_STATEMENT_EXPR_) && !defined(SIMDE_TESTS_H) - #define SIMDE_FLOAT16_C(value) (__extension__({ ((simde_float16) { HEDLEY_DIAGNOSTIC_PUSH SIMDE_DIAGNOSTIC_DISABLE_C99_EXTENSIONS_ HEDLEY_STATIC_CAST(__fp16, (value)) }); HEDLEY_DIAGNOSTIC_POP })) - #else - #define SIMDE_FLOAT16_C(value) ((simde_float16) { HEDLEY_STATIC_CAST(__fp16, (value)) }) - #define SIMDE_FLOAT16_IS_SCALAR 1 - #endif -#elif SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16 - typedef __fp16 simde_float16; - #define SIMDE_FLOAT16_IS_SCALAR 1 - #define SIMDE_FLOAT16_C(value) HEDLEY_STATIC_CAST(__fp16, (value)) -#elif SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_PORTABLE - typedef struct { uint16_t value; } simde_float16; -#else - #error No 16-bit floating point API. 
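/* Editorial aside, not part of the deleted files or of this diff: a minimal
 * usage sketch of the half-precision helpers defined in the simde-f16.h being
 * removed here. Whichever SIMDE_FLOAT16_API the cascade above selects,
 * portable callers should round-trip through the conversion functions rather
 * than assigning to simde_float16 directly, since the portable fallback is a
 * struct wrapping a uint16_t. The include path is an assumption; the function
 * names are taken from the header itself (defined further below). */
#include "simde-f16.h"

static float round_trip_half(float x)
{
    simde_float16 h = simde_float16_from_float32(x); /* narrow to binary16 (or native half) */
    return simde_float16_to_float32(h);              /* widen back to single precision */
}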
-#endif - -#if \ - defined(SIMDE_VECTOR_OPS) && \ - (SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_PORTABLE) && \ - (SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_FP16_NO_ABI) - #define SIMDE_FLOAT16_VECTOR -#endif - -/* Reinterpret -- you *generally* shouldn't need these, they're really - * intended for internal use. However, on x86 half-precision floats - * get stuffed into a __m128i/__m256i, so it may be useful. */ - -SIMDE_DEFINE_CONVERSION_FUNCTION_(simde_float16_as_uint16, uint16_t, simde_float16) -SIMDE_DEFINE_CONVERSION_FUNCTION_(simde_uint16_as_float16, simde_float16, uint16_t) - -#if SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_PORTABLE - #define SIMDE_NANHF simde_uint16_as_float16(0x7E00) // a quiet Not-a-Number - #define SIMDE_INFINITYHF simde_uint16_as_float16(0x7C00) - #define SIMDE_NINFINITYHF simde_uint16_as_float16(0xFC00) -#else - #if SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16_NO_ABI - #if SIMDE_MATH_BUILTIN_LIBM(nanf16) - #define SIMDE_NANHF SIMDE_FLOAT16_C(__builtin_nanf16("")) - #elif defined(SIMDE_MATH_NAN) - #define SIMDE_NANHF SIMDE_FLOAT16_C(SIMDE_MATH_NAN) - #endif - #if SIMDE_MATH_BUILTIN_LIBM(inf16) - #define SIMDE_INFINITYHF SIMDE_FLOAT16_C(__builtin_inf16()) - #define SIMDE_NINFINITYHF SIMDE_FLOAT16_C(-__builtin_inf16()) - #else - #define SIMDE_INFINITYHF SIMDE_FLOAT16_C(SIMDE_MATH_INFINITY) - #define SIMDE_NINFINITYHF SIMDE_FLOAT16_C(-SIMDE_MATH_INFINITY) - #endif - #else - #if SIMDE_MATH_BUILTIN_LIBM(nanf16) - #define SIMDE_NANHF __builtin_nanf16("") - #elif defined(SIMDE_MATH_NAN) - #define SIMDE_NANHF SIMDE_MATH_NAN - #endif - #if SIMDE_MATH_BUILTIN_LIBM(inf16) - #define SIMDE_INFINITYHF __builtin_inf16() - #define SIMDE_NINFINITYHF -(__builtin_inf16()) - #else - #define SIMDE_INFINITYHF HEDLEY_STATIC_CAST(simde_float16, SIMDE_MATH_INFINITY) - #define SIMDE_NINFINITYHF HEDLEY_STATIC_CAST(simde_float16, -SIMDE_MATH_INFINITY) - #endif - #endif -#endif - -/* Conversion -- convert between single-precision and half-precision - * floats. */ -static HEDLEY_ALWAYS_INLINE HEDLEY_CONST -simde_float16 -simde_float16_from_float32 (simde_float32 value) { - simde_float16 res; - - #if \ - (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16) || \ - (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) - res = HEDLEY_STATIC_CAST(simde_float16, value); - #elif (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16_NO_ABI) - res.value = HEDLEY_STATIC_CAST(__fp16, value); - #else - /* This code is CC0, based heavily on code by Fabian Giesen. */ - uint32_t f32u = simde_float32_as_uint32(value); - static const uint32_t f32u_infty = UINT32_C(255) << 23; - static const uint32_t f16u_max = (UINT32_C(127) + UINT32_C(16)) << 23; - static const uint32_t denorm_magic = - ((UINT32_C(127) - UINT32_C(15)) + (UINT32_C(23) - UINT32_C(10)) + UINT32_C(1)) << 23; - uint16_t f16u; - - uint32_t sign = f32u & (UINT32_C(1) << 31); - f32u ^= sign; - - /* NOTE all the integer compares in this function cast the operands - * to signed values to help compilers vectorize to SSE2, which lacks - * unsigned comparison instructions. This is fine since all - * operands are below 0x80000000 (we clear the sign bit). */ - - if (f32u > f16u_max) { /* result is Inf or NaN (all exponent bits set) */ - f16u = (f32u > f32u_infty) ? UINT32_C(0x7e00) : UINT32_C(0x7c00); /* NaN->qNaN and Inf->Inf */ - } else { /* (De)normalized number or zero */ - if (f32u < (UINT32_C(113) << 23)) { /* resulting FP16 is subnormal or zero */ - /* use a magic value to align our 10 mantissa bits at the bottom of - * the float. 
as long as FP addition is round-to-nearest-even this - * just works. */ - f32u = simde_float32_as_uint32(simde_uint32_as_float32(f32u) + simde_uint32_as_float32(denorm_magic)); - - /* and one integer subtract of the bias later, we have our final float! */ - f16u = HEDLEY_STATIC_CAST(uint16_t, f32u - denorm_magic); - } else { - uint32_t mant_odd = (f32u >> 13) & 1; - - /* update exponent, rounding bias part 1 */ - f32u += (HEDLEY_STATIC_CAST(uint32_t, 15 - 127) << 23) + UINT32_C(0xfff); - /* rounding bias part 2 */ - f32u += mant_odd; - /* take the bits! */ - f16u = HEDLEY_STATIC_CAST(uint16_t, f32u >> 13); - } - } - - f16u |= sign >> 16; - res = simde_uint16_as_float16(f16u); - #endif - - return res; -} - -static HEDLEY_ALWAYS_INLINE HEDLEY_CONST -simde_float32 -simde_float16_to_float32 (simde_float16 value) { - simde_float32 res; - - #if defined(SIMDE_FLOAT16_FLOAT16) || defined(SIMDE_FLOAT16_FP16) - res = HEDLEY_STATIC_CAST(simde_float32, value); - #else - /* This code is CC0, based heavily on code by Fabian Giesen. */ - uint16_t half = simde_float16_as_uint16(value); - const simde_float32 denorm_magic = simde_uint32_as_float32((UINT32_C(113) << 23)); - const uint32_t shifted_exp = UINT32_C(0x7c00) << 13; /* exponent mask after shift */ - uint32_t f32u; - - f32u = (half & UINT32_C(0x7fff)) << 13; /* exponent/mantissa bits */ - uint32_t exp = shifted_exp & f32u; /* just the exponent */ - f32u += (UINT32_C(127) - UINT32_C(15)) << 23; /* exponent adjust */ - - /* handle exponent special cases */ - if (exp == shifted_exp) /* Inf/NaN? */ - f32u += (UINT32_C(128) - UINT32_C(16)) << 23; /* extra exp adjust */ - else if (exp == 0) { /* Zero/Denormal? */ - f32u += (1) << 23; /* extra exp adjust */ - f32u = simde_float32_as_uint32(simde_uint32_as_float32(f32u) - denorm_magic); /* renormalize */ - } - - f32u |= (half & UINT32_C(0x8000)) << 16; /* sign bit */ - res = simde_uint32_as_float32(f32u); - #endif - - return res; -} - -#ifdef SIMDE_FLOAT16_C - #define SIMDE_FLOAT16_VALUE(value) SIMDE_FLOAT16_C(value) -#else - #define SIMDE_FLOAT16_VALUE(value) simde_float16_from_float32(SIMDE_FLOAT32_C(value)) -#endif - -#if !defined(simde_isinfhf) && defined(simde_math_isinff) - #define simde_isinfhf(a) simde_math_isinff(simde_float16_to_float32(a)) -#endif -#if !defined(simde_isnanhf) && defined(simde_math_isnanf) - #define simde_isnanhf(a) simde_math_isnanf(simde_float16_to_float32(a)) -#endif -#if !defined(simde_isnormalhf) && defined(simde_math_isnormalf) - #define simde_isnormalhf(a) simde_math_isnormalf(simde_float16_to_float32(a)) -#endif -#if !defined(simde_issubnormalhf) && defined(simde_math_issubnormalf) - #define simde_issubnormalhf(a) simde_math_issubnormalf(simde_float16_to_float32(a)) -#endif - -#define simde_fpclassifyhf(a) simde_math_fpclassifyf(simde_float16_to_float32(a)) - -static HEDLEY_INLINE -uint8_t -simde_fpclasshf(simde_float16 v, const int imm8) { - uint16_t bits = simde_float16_as_uint16(v); - uint8_t negative = (bits >> 15) & 1; - uint16_t const ExpMask = 0x7C00; // [14:10] - uint16_t const MantMask = 0x03FF; // [9:0] - uint8_t exponent_all_ones = ((bits & ExpMask) == ExpMask); - uint8_t exponent_all_zeros = ((bits & ExpMask) == 0); - uint8_t mantissa_all_zeros = ((bits & MantMask) == 0); - uint8_t zero = exponent_all_zeros & mantissa_all_zeros; - uint8_t signaling_bit = (bits >> 9) & 1; - - uint8_t result = 0; - uint8_t snan = exponent_all_ones & (!mantissa_all_zeros) & (!signaling_bit); - uint8_t qnan = exponent_all_ones & (!mantissa_all_zeros) & signaling_bit; - uint8_t 
positive_zero = (!negative) & zero; - uint8_t negative_zero = negative & zero; - uint8_t positive_infinity = (!negative) & exponent_all_ones & mantissa_all_zeros; - uint8_t negative_infinity = negative & exponent_all_ones & mantissa_all_zeros; - uint8_t denormal = exponent_all_zeros & (!mantissa_all_zeros); - uint8_t finite_negative = negative & (!exponent_all_ones) & (!zero); - result = (((imm8 >> 0) & qnan) | \ - ((imm8 >> 1) & positive_zero) | \ - ((imm8 >> 2) & negative_zero) | \ - ((imm8 >> 3) & positive_infinity) | \ - ((imm8 >> 4) & negative_infinity) | \ - ((imm8 >> 5) & denormal) | \ - ((imm8 >> 6) & finite_negative) | \ - ((imm8 >> 7) & snan)); - return result; -} - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_FLOAT16_H) */ diff --git a/extern/simde/simde-features.h b/extern/simde/simde-features.h deleted file mode 100644 index 622d12908..000000000 --- a/extern/simde/simde-features.h +++ /dev/null @@ -1,752 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - */ - -/* simde-arch.h is used to determine which features are available according - to the compiler. 
However, we want to make it possible to forcibly enable - or disable APIs */ - -#if !defined(SIMDE_FEATURES_H) -#define SIMDE_FEATURES_H - -#include "simde-arch.h" -#include "simde-diagnostic.h" - -#if !defined(SIMDE_X86_SVML_NATIVE) && !defined(SIMDE_X86_SVML_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_SVML) - #define SIMDE_X86_SVML_NATIVE - #endif -#endif - -#if !defined(SIMDE_X86_AVX512VP2INTERSECT_NATIVE) && !defined(SIMDE_X86_AVX512VP2INTERSECT_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_AVX512VP2INTERSECT) - #define SIMDE_X86_AVX512VP2INTERSECT_NATIVE - #endif -#endif -#if defined(SIMDE_X86_AVX512VP2INTERSECT_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE) - #define SIMDE_X86_AVX512F_NATIVE -#endif - -#if !defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && !defined(SIMDE_X86_AVX512VPOPCNTDQ_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_AVX512VPOPCNTDQ) - #define SIMDE_X86_AVX512VPOPCNTDQ_NATIVE - #endif -#endif -#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE) - #define SIMDE_X86_AVX512F_NATIVE -#endif - -#if !defined(SIMDE_X86_AVX512BITALG_NATIVE) && !defined(SIMDE_X86_AVX512BITALG_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_AVX512BITALG) - #define SIMDE_X86_AVX512BITALG_NATIVE - #endif -#endif -#if defined(SIMDE_X86_AVX512BITALG_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE) - #define SIMDE_X86_AVX512F_NATIVE -#endif - -#if !defined(SIMDE_X86_AVX512VBMI_NATIVE) && !defined(SIMDE_X86_AVX512VBMI_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_AVX512VBMI) - #define SIMDE_X86_AVX512VBMI_NATIVE - #endif -#endif -#if defined(SIMDE_X86_AVX512VBMI_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE) - #define SIMDE_X86_AVX512F_NATIVE -#endif - -#if !defined(SIMDE_X86_AVX512VBMI2_NATIVE) && !defined(SIMDE_X86_AVX512VBMI2_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_AVX512VBMI2) - #define SIMDE_X86_AVX512VBMI2_NATIVE - #endif -#endif -#if defined(SIMDE_X86_AVX512VBMI2_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE) - #define SIMDE_X86_AVX512F_NATIVE -#endif - -#if !defined(SIMDE_X86_AVX512VNNI_NATIVE) && !defined(SIMDE_X86_AVX512VNNI_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_AVX512VNNI) - #define SIMDE_X86_AVX512VNNI_NATIVE - #endif -#endif -#if defined(SIMDE_X86_AVX512VNNI_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE) - #define SIMDE_X86_AVX512F_NATIVE -#endif - -#if !defined(SIMDE_X86_AVX5124VNNIW_NATIVE) && !defined(SIMDE_X86_AVX5124VNNIW_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_AVX5124VNNIW) - #define SIMDE_X86_AVX5124VNNIW_NATIVE - #endif -#endif -#if defined(SIMDE_X86_AVX5124VNNIW_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE) - #define SIMDE_X86_AVX512F_NATIVE -#endif - -#if !defined(SIMDE_X86_AVX512CD_NATIVE) && !defined(SIMDE_X86_AVX512CD_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_AVX512CD) - #define SIMDE_X86_AVX512CD_NATIVE - #endif -#endif -#if defined(SIMDE_X86_AVX512CD_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE) - #define SIMDE_X86_AVX512F_NATIVE -#endif - -#if !defined(SIMDE_X86_AVX512DQ_NATIVE) && !defined(SIMDE_X86_AVX512DQ_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_AVX512DQ) - #define SIMDE_X86_AVX512DQ_NATIVE - #endif -#endif -#if defined(SIMDE_X86_AVX512DQ_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE) - #define SIMDE_X86_AVX512F_NATIVE -#endif - -#if !defined(SIMDE_X86_AVX512VL_NATIVE) && 
!defined(SIMDE_X86_AVX512VL_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_AVX512VL) - #define SIMDE_X86_AVX512VL_NATIVE - #endif -#endif -#if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE) - #define SIMDE_X86_AVX512F_NATIVE -#endif - -#if !defined(SIMDE_X86_AVX512BW_NATIVE) && !defined(SIMDE_X86_AVX512BW_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_AVX512BW) - #define SIMDE_X86_AVX512BW_NATIVE - #endif -#endif -#if defined(SIMDE_X86_AVX512BW_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE) - #define SIMDE_X86_AVX512F_NATIVE -#endif - -#if !defined(SIMDE_X86_AVX512FP16_NATIVE) && !defined(SIMDE_X86_AVX512FP16_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_AVX512FP16) - #define SIMDE_X86_AVX512FP16_NATIVE - #endif -#endif -#if defined(SIMDE_X86_AVX512BW_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE) - #define SIMDE_X86_AVX512F_NATIVE -#endif - -#if !defined(SIMDE_X86_AVX512BF16_NATIVE) && !defined(SIMDE_X86_AVX512BF16_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_AVX512BF16) - #define SIMDE_X86_AVX512BF16_NATIVE - #endif -#endif -#if defined(SIMDE_X86_AVX512BF16_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE) - #define SIMDE_X86_AVX512F_NATIVE -#endif - -#if !defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_X86_AVX512F_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_AVX512F) - #define SIMDE_X86_AVX512F_NATIVE - #endif -#endif -#if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_X86_AVX2_NATIVE) - #define SIMDE_X86_AVX2_NATIVE -#endif - -#if !defined(SIMDE_X86_FMA_NATIVE) && !defined(SIMDE_X86_FMA_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_FMA) - #define SIMDE_X86_FMA_NATIVE - #endif -#endif -#if defined(SIMDE_X86_FMA_NATIVE) && !defined(SIMDE_X86_AVX_NATIVE) - #define SIMDE_X86_AVX_NATIVE -#endif - -#if !defined(SIMDE_X86_AVX2_NATIVE) && !defined(SIMDE_X86_AVX2_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_AVX2) - #define SIMDE_X86_AVX2_NATIVE - #endif -#endif -#if defined(SIMDE_X86_AVX2_NATIVE) && !defined(SIMDE_X86_AVX_NATIVE) - #define SIMDE_X86_AVX_NATIVE -#endif - -#if !defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_X86_AVX_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_AVX) - #define SIMDE_X86_AVX_NATIVE - #endif -#endif -#if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_X86_SSE4_2_NATIVE) - #define SIMDE_X86_SSE4_2_NATIVE -#endif - -#if !defined(SIMDE_X86_XOP_NATIVE) && !defined(SIMDE_X86_XOP_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_XOP) - #define SIMDE_X86_XOP_NATIVE - #endif -#endif -#if defined(SIMDE_X86_XOP_NATIVE) && !defined(SIMDE_X86_SSE4_2_NATIVE) - #define SIMDE_X86_SSE4_2_NATIVE -#endif - -#if !defined(SIMDE_X86_SSE4_2_NATIVE) && !defined(SIMDE_X86_SSE4_2_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_SSE4_2) - #define SIMDE_X86_SSE4_2_NATIVE - #endif -#endif -#if defined(SIMDE_X86_SSE4_2_NATIVE) && !defined(SIMDE_X86_SSE4_1_NATIVE) - #define SIMDE_X86_SSE4_1_NATIVE -#endif - -#if !defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(SIMDE_X86_SSE4_1_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_SSE4_1) - #define SIMDE_X86_SSE4_1_NATIVE - #endif -#endif -#if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(SIMDE_X86_SSSE3_NATIVE) - #define SIMDE_X86_SSSE3_NATIVE -#endif - -#if !defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_X86_SSSE3_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if 
defined(SIMDE_ARCH_X86_SSSE3) - #define SIMDE_X86_SSSE3_NATIVE - #endif -#endif -#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_X86_SSE3_NATIVE) - #define SIMDE_X86_SSE3_NATIVE -#endif - -#if !defined(SIMDE_X86_SSE3_NATIVE) && !defined(SIMDE_X86_SSE3_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_SSE3) - #define SIMDE_X86_SSE3_NATIVE - #endif -#endif -#if defined(SIMDE_X86_SSE3_NATIVE) && !defined(SIMDE_X86_SSE2_NATIVE) - #define SIMDE_X86_SSE2_NATIVE -#endif - -#if !defined(SIMDE_X86_AES_NATIVE) && !defined(SIMDE_X86_AES_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_AES) - #define SIMDE_X86_AES_NATIVE - #endif -#endif -#if defined(SIMDE_X86_AES_NATIVE) && !defined(SIMDE_X86_SSE2_NATIVE) - #define SIMDE_X86_SSE2_NATIVE -#endif - -#if !defined(SIMDE_X86_SSE2_NATIVE) && !defined(SIMDE_X86_SSE2_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_SSE2) - #define SIMDE_X86_SSE2_NATIVE - #endif -#endif -#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(SIMDE_X86_SSE_NATIVE) - #define SIMDE_X86_SSE_NATIVE -#endif - -#if !defined(SIMDE_X86_SSE_NATIVE) && !defined(SIMDE_X86_SSE_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_SSE) - #define SIMDE_X86_SSE_NATIVE - #endif -#endif - -#if !defined(SIMDE_X86_MMX_NATIVE) && !defined(SIMDE_X86_MMX_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_MMX) - #define SIMDE_X86_MMX_NATIVE - #endif -#endif - -#if !defined(SIMDE_X86_GFNI_NATIVE) && !defined(SIMDE_X86_GFNI_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_GFNI) - #define SIMDE_X86_GFNI_NATIVE - #endif -#endif - -#if !defined(SIMDE_X86_PCLMUL_NATIVE) && !defined(SIMDE_X86_PCLMUL_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_PCLMUL) - #define SIMDE_X86_PCLMUL_NATIVE - #endif -#endif - -#if !defined(SIMDE_X86_VPCLMULQDQ_NATIVE) && !defined(SIMDE_X86_VPCLMULQDQ_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_VPCLMULQDQ) - #define SIMDE_X86_VPCLMULQDQ_NATIVE - #endif -#endif - -#if !defined(SIMDE_X86_F16C_NATIVE) && !defined(SIMDE_X86_F16C_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_F16C) - #define SIMDE_X86_F16C_NATIVE - #endif -#endif - -#if !defined(SIMDE_X86_SVML_NATIVE) && !defined(SIMDE_X86_SVML_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86) && (defined(__INTEL_COMPILER) || (HEDLEY_MSVC_VERSION_CHECK(14, 20, 0) && !defined(__clang__))) - #define SIMDE_X86_SVML_NATIVE - #endif -#endif - -#if defined(HEDLEY_MSVC_VERSION) - #pragma warning(push) - #pragma warning(disable:4799) -#endif - -#if \ - defined(SIMDE_X86_AVX_NATIVE) || defined(SIMDE_X86_GFNI_NATIVE) || defined(SIMDE_X86_SVML_NATIVE) - #include -#elif defined(SIMDE_X86_SSE4_2_NATIVE) - #include -#elif defined(SIMDE_X86_SSE4_1_NATIVE) - #include -#elif defined(SIMDE_X86_SSSE3_NATIVE) - #include -#elif defined(SIMDE_X86_SSE3_NATIVE) - #include -#elif defined(SIMDE_X86_SSE2_NATIVE) - #include -#elif defined(SIMDE_X86_SSE_NATIVE) - #include -#elif defined(SIMDE_X86_MMX_NATIVE) - #include -#endif - -#if defined(SIMDE_X86_XOP_NATIVE) - #if defined(_MSC_VER) - #include - #else - #include - #endif -#endif - -#if defined(SIMDE_X86_AES_NATIVE) - #include -#endif - -#if defined(HEDLEY_MSVC_VERSION) - #pragma warning(pop) -#endif - -#if !defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_ARM_NEON_A64V8_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_ARM_NEON) && defined(SIMDE_ARCH_AARCH64) && SIMDE_ARCH_ARM_CHECK(8,0) - 
#define SIMDE_ARM_NEON_A64V8_NATIVE - #endif -#endif -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_ARM_NEON_A32V8_NATIVE) - #define SIMDE_ARM_NEON_A32V8_NATIVE -#endif - -#if !defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_ARM_NEON_A32V8_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_ARM_NEON) && SIMDE_ARCH_ARM_CHECK(8,0) && (__ARM_NEON_FP & 0x02) - #define SIMDE_ARM_NEON_A32V8_NATIVE - #endif -#endif -#if defined(__ARM_ACLE) - #include -#endif -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #define SIMDE_ARM_NEON_A32V7_NATIVE -#endif - -#if !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_ARM_NEON_A32V7_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_ARM_NEON) && SIMDE_ARCH_ARM_CHECK(7,0) - #define SIMDE_ARM_NEON_A32V7_NATIVE - #endif -#endif -#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #include - #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - #include - #endif -#endif - -#if !defined(SIMDE_ARM_SVE_NATIVE) && !defined(SIMDE_ARM_SVE_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_ARM_SVE) - #define SIMDE_ARM_SVE_NATIVE - #include - #endif -#endif - -#if !defined(SIMDE_WASM_SIMD128_NATIVE) && !defined(SIMDE_WASM_SIMD128_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_WASM_SIMD128) - #define SIMDE_WASM_SIMD128_NATIVE - #endif -#endif - -#if !defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) && !defined(SIMDE_WASM_RELAXED_SIMD_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_WASM_RELAXED_SIMD) - #define SIMDE_WASM_RELAXED_SIMD_NATIVE - #endif -#endif -#if defined(SIMDE_WASM_SIMD128_NATIVE) || defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) - #include -#endif - -#if !defined(SIMDE_POWER_ALTIVEC_P9_NATIVE) && !defined(SIMDE_POWER_ALTIVEC_P9_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if SIMDE_ARCH_POWER_ALTIVEC_CHECK(900) - #define SIMDE_POWER_ALTIVEC_P9_NATIVE - #endif -#endif -#if defined(SIMDE_POWER_ALTIVEC_P9_NATIVE) && !defined(SIMDE_POWER_ALTIVEC_P8) - #define SIMDE_POWER_ALTIVEC_P8_NATIVE -#endif - -#if !defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(SIMDE_POWER_ALTIVEC_P8_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if SIMDE_ARCH_POWER_ALTIVEC_CHECK(800) - #define SIMDE_POWER_ALTIVEC_P8_NATIVE - #endif -#endif -#if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(SIMDE_POWER_ALTIVEC_P7) - #define SIMDE_POWER_ALTIVEC_P7_NATIVE -#endif - -#if !defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && !defined(SIMDE_POWER_ALTIVEC_P7_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if SIMDE_ARCH_POWER_ALTIVEC_CHECK(700) - #define SIMDE_POWER_ALTIVEC_P7_NATIVE - #endif -#endif -#if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && !defined(SIMDE_POWER_ALTIVEC_P6) - #define SIMDE_POWER_ALTIVEC_P6_NATIVE -#endif - -#if !defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && !defined(SIMDE_POWER_ALTIVEC_P6_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if SIMDE_ARCH_POWER_ALTIVEC_CHECK(600) - #define SIMDE_POWER_ALTIVEC_P6_NATIVE - #endif -#endif -#if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && !defined(SIMDE_POWER_ALTIVEC_P5) - #define SIMDE_POWER_ALTIVEC_P5_NATIVE -#endif - -#if !defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) && !defined(SIMDE_POWER_ALTIVEC_P5_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if SIMDE_ARCH_POWER_ALTIVEC_CHECK(500) - #define SIMDE_POWER_ALTIVEC_P5_NATIVE - #endif -#endif - -#if !defined(SIMDE_ZARCH_ZVECTOR_15_NATIVE) && !defined(SIMDE_ZARCH_ZVECTOR_15_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if SIMDE_ARCH_ZARCH_CHECK(13) && defined(SIMDE_ARCH_ZARCH_ZVECTOR) - #define 
SIMDE_ZARCH_ZVECTOR_15_NATIVE - #endif -#endif - -#if !defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) && !defined(SIMDE_ZARCH_ZVECTOR_14_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if SIMDE_ARCH_ZARCH_CHECK(12) && defined(SIMDE_ARCH_ZARCH_ZVECTOR) - #define SIMDE_ZARCH_ZVECTOR_14_NATIVE - #endif -#endif - -#if !defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) && !defined(SIMDE_ZARCH_ZVECTOR_13_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if SIMDE_ARCH_ZARCH_CHECK(11) && defined(SIMDE_ARCH_ZARCH_ZVECTOR) - #define SIMDE_ZARCH_ZVECTOR_13_NATIVE - #endif -#endif - -#if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - /* AltiVec conflicts with lots of stuff. The bool keyword conflicts - * with the bool keyword in C++ and the bool macro in C99+ (defined - * in stdbool.h). The vector keyword conflicts with std::vector in - * C++ if you are `using std;`. - * - * Luckily AltiVec allows you to use `__vector`/`__bool`/`__pixel` - * instead, but altivec.h will unconditionally define - * `vector`/`bool`/`pixel` so we need to work around that. - * - * Unfortunately this means that if your code uses AltiVec directly - * it may break. If this is the case you'll want to define - * `SIMDE_POWER_ALTIVEC_NO_UNDEF` before including SIMDe. Or, even - * better, port your code to use the double-underscore versions. */ - #if defined(bool) - #undef bool - #endif - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - #include - - #if !defined(SIMDE_POWER_ALTIVEC_NO_UNDEF) - #if defined(vector) - #undef vector - #endif - #if defined(pixel) - #undef pixel - #endif - #if defined(bool) - #undef bool - #endif - #endif /* !defined(SIMDE_POWER_ALTIVEC_NO_UNDEF) */ - #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - #include - #endif - - /* Use these intsead of vector/pixel/bool in SIMDe. */ - #define SIMDE_POWER_ALTIVEC_VECTOR(T) __vector T - #define SIMDE_POWER_ALTIVEC_PIXEL __pixel - #define SIMDE_POWER_ALTIVEC_BOOL __bool - - /* Re-define bool if we're using stdbool.h */ - #if !defined(__cplusplus) && defined(__bool_true_false_are_defined) && !defined(SIMDE_POWER_ALTIVEC_NO_UNDEF) - #define bool _Bool - #endif -#endif - -#if !defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) && !defined(SIMDE_MIPS_LOONGSON_MMI_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_MIPS_LOONGSON_MMI) - #define SIMDE_MIPS_LOONGSON_MMI_NATIVE 1 - #endif -#endif -#if defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - #include -#endif - -#if !defined(SIMDE_MIPS_MSA_NATIVE) && !defined(SIMDE_MIPS_MSA_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_MIPS_MSA) - #define SIMDE_MIPS_MSA_NATIVE 1 - #endif -#endif -#if defined(SIMDE_MIPS_MSA_NATIVE) - #include -#endif - -/* This is used to determine whether or not to fall back on a vector - * function in an earlier ISA extensions, as well as whether - * we expected any attempts at vectorization to be fruitful or if we - * expect to always be running serial code. - * - * Note that, for some architectures (okay, *one* architecture) there - * can be a split where some types are supported for one vector length - * but others only for a shorter length. Therefore, it is possible to - * provide separate values for float/int/double types. 
*/ - -#if !defined(SIMDE_NATURAL_VECTOR_SIZE) - #if defined(SIMDE_X86_AVX512F_NATIVE) - #define SIMDE_NATURAL_VECTOR_SIZE (512) - #elif defined(SIMDE_X86_AVX2_NATIVE) - #define SIMDE_NATURAL_VECTOR_SIZE (256) - #elif defined(SIMDE_X86_AVX_NATIVE) - #define SIMDE_NATURAL_FLOAT_VECTOR_SIZE (256) - #define SIMDE_NATURAL_INT_VECTOR_SIZE (128) - #define SIMDE_NATURAL_DOUBLE_VECTOR_SIZE (128) - #elif \ - defined(SIMDE_X86_SSE2_NATIVE) || \ - defined(SIMDE_ARM_NEON_A32V7_NATIVE) || \ - defined(SIMDE_WASM_SIMD128_NATIVE) || \ - defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) || \ - defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) || \ - defined(SIMDE_MIPS_MSA_NATIVE) - #define SIMDE_NATURAL_VECTOR_SIZE (128) - #elif defined(SIMDE_X86_SSE_NATIVE) - #define SIMDE_NATURAL_FLOAT_VECTOR_SIZE (128) - #define SIMDE_NATURAL_INT_VECTOR_SIZE (64) - #define SIMDE_NATURAL_DOUBLE_VECTOR_SIZE (0) - #endif - - #if !defined(SIMDE_NATURAL_VECTOR_SIZE) - #if defined(SIMDE_NATURAL_FLOAT_VECTOR_SIZE) - #define SIMDE_NATURAL_VECTOR_SIZE SIMDE_NATURAL_FLOAT_VECTOR_SIZE - #elif defined(SIMDE_NATURAL_INT_VECTOR_SIZE) - #define SIMDE_NATURAL_VECTOR_SIZE SIMDE_NATURAL_INT_VECTOR_SIZE - #elif defined(SIMDE_NATURAL_DOUBLE_VECTOR_SIZE) - #define SIMDE_NATURAL_VECTOR_SIZE SIMDE_NATURAL_DOUBLE_VECTOR_SIZE - #else - #define SIMDE_NATURAL_VECTOR_SIZE (0) - #endif - #endif - - #if !defined(SIMDE_NATURAL_FLOAT_VECTOR_SIZE) - #define SIMDE_NATURAL_FLOAT_VECTOR_SIZE SIMDE_NATURAL_VECTOR_SIZE - #endif - #if !defined(SIMDE_NATURAL_INT_VECTOR_SIZE) - #define SIMDE_NATURAL_INT_VECTOR_SIZE SIMDE_NATURAL_VECTOR_SIZE - #endif - #if !defined(SIMDE_NATURAL_DOUBLE_VECTOR_SIZE) - #define SIMDE_NATURAL_DOUBLE_VECTOR_SIZE SIMDE_NATURAL_VECTOR_SIZE - #endif -#endif - -#define SIMDE_NATURAL_VECTOR_SIZE_LE(x) ((SIMDE_NATURAL_VECTOR_SIZE > 0) && (SIMDE_NATURAL_VECTOR_SIZE <= (x))) -#define SIMDE_NATURAL_VECTOR_SIZE_GE(x) ((SIMDE_NATURAL_VECTOR_SIZE > 0) && (SIMDE_NATURAL_VECTOR_SIZE >= (x))) -#define SIMDE_NATURAL_FLOAT_VECTOR_SIZE_LE(x) ((SIMDE_NATURAL_FLOAT_VECTOR_SIZE > 0) && (SIMDE_NATURAL_FLOAT_VECTOR_SIZE <= (x))) -#define SIMDE_NATURAL_FLOAT_VECTOR_SIZE_GE(x) ((SIMDE_NATURAL_FLOAT_VECTOR_SIZE > 0) && (SIMDE_NATURAL_FLOAT_VECTOR_SIZE >= (x))) -#define SIMDE_NATURAL_INT_VECTOR_SIZE_LE(x) ((SIMDE_NATURAL_INT_VECTOR_SIZE > 0) && (SIMDE_NATURAL_INT_VECTOR_SIZE <= (x))) -#define SIMDE_NATURAL_INT_VECTOR_SIZE_GE(x) ((SIMDE_NATURAL_INT_VECTOR_SIZE > 0) && (SIMDE_NATURAL_INT_VECTOR_SIZE >= (x))) -#define SIMDE_NATURAL_DOUBLE_VECTOR_SIZE_LE(x) ((SIMDE_NATURAL_DOUBLE_VECTOR_SIZE > 0) && (SIMDE_NATURAL_DOUBLE_VECTOR_SIZE <= (x))) -#define SIMDE_NATURAL_DOUBLE_VECTOR_SIZE_GE(x) ((SIMDE_NATURAL_DOUBLE_VECTOR_SIZE > 0) && (SIMDE_NATURAL_DOUBLE_VECTOR_SIZE >= (x))) - -/* Native aliases */ -#if defined(SIMDE_ENABLE_NATIVE_ALIASES) - #if !defined(SIMDE_X86_MMX_NATIVE) - #define SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_SSE_NATIVE) - #define SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_SSE2_NATIVE) - #define SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_SSE3_NATIVE) - #define SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_SSSE3_NATIVE) - #define SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_SSE4_1_NATIVE) - #define SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_SSE4_2_NATIVE) - #define SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_AVX_NATIVE) - #define SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES - #endif - #if 
!defined(SIMDE_X86_AVX2_NATIVE) - #define SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_FMA_NATIVE) - #define SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_AVX512F_NATIVE) - #define SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_AVX512VL_NATIVE) - #define SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_AVX512VBMI_NATIVE) - #define SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_AVX512VBMI2_NATIVE) - #define SIMDE_X86_AVX512VBMI2_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_AVX512BW_NATIVE) - #define SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_AVX512VNNI_NATIVE) - #define SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_AVX5124VNNIW_NATIVE) - #define SIMDE_X86_AVX5124VNNIW_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_AVX512BF16_NATIVE) - #define SIMDE_X86_AVX512BF16_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_AVX512BITALG_NATIVE) - #define SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) - #define SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_AVX512VP2INTERSECT_NATIVE) - #define SIMDE_X86_AVX512VP2INTERSECT_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_AVX512DQ_NATIVE) - #define SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_AVX512CD_NATIVE) - #define SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_AVX512FP16_NATIVE) - #define SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_GFNI_NATIVE) - #define SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_PCLMUL_NATIVE) - #define SIMDE_X86_PCLMUL_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_VPCLMULQDQ_NATIVE) - #define SIMDE_X86_VPCLMULQDQ_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_F16C_NATIVE) - #define SIMDE_X86_F16C_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_AES_NATIVE) - #define SIMDE_X86_AES_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_SVML_NATIVE) - #define SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES - #endif - - #if !defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #define SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_ARM_NEON_A32V8_NATIVE) - #define SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_ARM_NEON_A64V8_NATIVE) - #define SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES - #endif - - #if !defined(SIMDE_ARM_SVE_NATIVE) - #define SIMDE_ARM_SVE_ENABLE_NATIVE_ALIASES - #endif - - #if !defined(SIMDE_MIPS_MSA_NATIVE) - #define SIMDE_MIPS_MSA_ENABLE_NATIVE_ALIASES - #endif - - #if !defined(SIMDE_WASM_SIMD128_NATIVE) - #define SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES - #endif -#endif - -/* Are floating point values stored using IEEE 754? Knowing - * this at during preprocessing is a bit tricky, mostly because what - * we're curious about is how values are stored and not whether the - * implementation is fully conformant in terms of rounding, NaN - * handling, etc. - * - * For example, if you use -ffast-math or -Ofast on - * GCC or clang IEEE 754 isn't strictly followed, therefore IEE 754 - * support is not advertised (by defining __STDC_IEC_559__). 
- * - * However, what we care about is whether it is safe to assume that - * floating point values are stored in IEEE 754 format, in which case - * we can provide faster implementations of some functions. - * - * Luckily every vaugely modern architecture I'm aware of uses IEEE 754- - * so we just assume IEEE 754 for now. There is a test which verifies - * this, if that test fails sowewhere please let us know and we'll add - * an exception for that platform. Meanwhile, you can define - * SIMDE_NO_IEEE754_STORAGE. */ -#if !defined(SIMDE_IEEE754_STORAGE) && !defined(SIMDE_NO_IEE754_STORAGE) - #define SIMDE_IEEE754_STORAGE -#endif - -#if defined(SIMDE_ARCH_ARM_NEON_FP16) - #define SIMDE_ARM_NEON_FP16 -#endif - -#if defined(SIMDE_ARCH_ARM_NEON_BF16) - #define SIMDE_ARM_NEON_BF16 -#endif - -#if !defined(SIMDE_LOONGARCH_LASX_NATIVE) && !defined(SIMDE_LOONGARCH_LASX_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_LOONGARCH_LASX) - #define SIMDE_LOONGARCH_LASX_NATIVE - #endif -#endif - -#if !defined(SIMDE_LOONGARCH_LSX_NATIVE) && !defined(SIMDE_LOONGARCH_LSX_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_LOONGARCH_LSX) - #define SIMDE_LOONGARCH_LSX_NATIVE - #endif -#endif - -#if defined(SIMDE_LOONGARCH_LASX_NATIVE) - #include -#endif -#if defined(SIMDE_LOONGARCH_LSX_NATIVE) - #include -#endif - -#endif /* !defined(SIMDE_FEATURES_H) */ diff --git a/extern/simde/simde-math.h b/extern/simde/simde-math.h deleted file mode 100644 index 02de568dc..000000000 --- a/extern/simde/simde-math.h +++ /dev/null @@ -1,2065 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2017-2020 Evan Nemerson - * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) - */ - -/* Attempt to find math functions. Functions may be in , - * , compiler built-ins/intrinsics, or platform/architecture - * specific headers. In some cases, especially those not built in to - * libm, we may need to define our own implementations. */ - -#if !defined(SIMDE_MATH_H) -#define SIMDE_MATH_H 1 - -#include "hedley.h" -#include "simde-features.h" - -#include -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - #include -#endif - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS - -/* SLEEF support - * https://sleef.org/ - * - * If you include prior to including SIMDe, SIMDe will use - * SLEEF. You can also define SIMDE_MATH_SLEEF_ENABLE prior to - * including SIMDe to force the issue. 
- * - * Note that SLEEF does requires linking to libsleef. - * - * By default, SIMDe will use the 1 ULP functions, but if you use - * SIMDE_ACCURACY_PREFERENCE of 0 we will use up to 4 ULP. This is - * only the case for the simde_math_* functions; for code in other - * SIMDe headers which calls SLEEF directly we may use functions with - * greater error if the API we're implementing is less precise (for - * example, SVML guarantees 4 ULP, so we will generally use the 3.5 - * ULP functions from SLEEF). */ -#if !defined(SIMDE_MATH_SLEEF_DISABLE) - #if defined(__SLEEF_H__) - #define SIMDE_MATH_SLEEF_ENABLE - #endif -#endif - -#if defined(SIMDE_MATH_SLEEF_ENABLE) && !defined(__SLEEF_H__) - HEDLEY_DIAGNOSTIC_PUSH - SIMDE_DIAGNOSTIC_DISABLE_IGNORED_QUALIFIERS_ - #include - HEDLEY_DIAGNOSTIC_POP -#endif - -#if defined(SIMDE_MATH_SLEEF_ENABLE) && defined(__SLEEF_H__) - #if defined(SLEEF_VERSION_MAJOR) - #define SIMDE_MATH_SLEEF_VERSION_CHECK(major, minor, patch) (HEDLEY_VERSION_ENCODE(SLEEF_VERSION_MAJOR, SLEEF_VERSION_MINOR, SLEEF_VERSION_PATCHLEVEL) >= HEDLEY_VERSION_ENCODE(major, minor, patch)) - #else - #define SIMDE_MATH_SLEEF_VERSION_CHECK(major, minor, patch) (HEDLEY_VERSION_ENCODE(3,0,0) >= HEDLEY_VERSION_ENCODE(major, minor, patch)) - #endif -#else - #define SIMDE_MATH_SLEEF_VERSION_CHECK(major, minor, patch) (0) -#endif - -#if defined(__has_builtin) - #define SIMDE_MATH_BUILTIN_LIBM(func) __has_builtin(__builtin_##func) -#elif \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_GCC_VERSION_CHECK(4,4,0) - #define SIMDE_MATH_BUILTIN_LIBM(func) (1) -#else - #define SIMDE_MATH_BUILTIN_LIBM(func) (0) -#endif - -#if defined(HUGE_VAL) - /* Looks like or has already been included. */ - - /* The math.h from libc++ (yes, the C header from the C++ standard - * library) will define an isnan function, but not an isnan macro - * like the C standard requires. So we detect the header guards - * macro libc++ uses. */ - #if defined(isnan) || (defined(_LIBCPP_MATH_H) && !defined(_LIBCPP_CMATH)) - #define SIMDE_MATH_HAVE_MATH_H - #elif defined(__cplusplus) - #define SIMDE_MATH_HAVE_CMATH - #endif -#elif defined(__has_include) - #if defined(__cplusplus) && (__cplusplus >= 201103L) && __has_include() - #define SIMDE_MATH_HAVE_CMATH - #include - #elif __has_include() - #define SIMDE_MATH_HAVE_MATH_H - #include - #elif !defined(SIMDE_MATH_NO_LIBM) - #define SIMDE_MATH_NO_LIBM - #endif -#elif !defined(SIMDE_MATH_NO_LIBM) - #if defined(__cplusplus) && (__cplusplus >= 201103L) - #define SIMDE_MATH_HAVE_CMATH - HEDLEY_DIAGNOSTIC_PUSH - #if defined(HEDLEY_MSVC_VERSION) - /* VS 14 emits this diagnostic about noexcept being used on a - * function, which we can't do anything about. 
*/ - #pragma warning(disable:4996) - #endif - #include - HEDLEY_DIAGNOSTIC_POP - #else - #define SIMDE_MATH_HAVE_MATH_H - #include - #endif -#endif - -#if !defined(SIMDE_MATH_INFINITY) - #if \ - HEDLEY_HAS_BUILTIN(__builtin_inf) || \ - HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_CRAY_VERSION_CHECK(8,1,0) - #define SIMDE_MATH_INFINITY (__builtin_inf()) - #elif defined(INFINITY) - #define SIMDE_MATH_INFINITY INFINITY - #endif -#endif - -#if !defined(SIMDE_INFINITYF) - #if \ - HEDLEY_HAS_BUILTIN(__builtin_inff) || \ - HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \ - HEDLEY_IBM_VERSION_CHECK(13,1,0) - #define SIMDE_MATH_INFINITYF (__builtin_inff()) - #elif defined(INFINITYF) - #define SIMDE_MATH_INFINITYF INFINITYF - #elif defined(SIMDE_MATH_INFINITY) - #define SIMDE_MATH_INFINITYF HEDLEY_STATIC_CAST(float, SIMDE_MATH_INFINITY) - #endif -#endif - -#if !defined(SIMDE_MATH_NAN) - #if \ - HEDLEY_HAS_BUILTIN(__builtin_nan) || \ - HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \ - HEDLEY_IBM_VERSION_CHECK(13,1,0) - #define SIMDE_MATH_NAN (__builtin_nan("")) - #elif defined(NAN) - #define SIMDE_MATH_NAN NAN - #endif -#endif - -#if !defined(SIMDE_NANF) - #if \ - HEDLEY_HAS_BUILTIN(__builtin_nanf) || \ - HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_CRAY_VERSION_CHECK(8,1,0) - #define SIMDE_MATH_NANF (__builtin_nanf("")) - #elif defined(NANF) - #define SIMDE_MATH_NANF NANF - #elif defined(SIMDE_MATH_NAN) - #define SIMDE_MATH_NANF HEDLEY_STATIC_CAST(float, SIMDE_MATH_NAN) - #endif -#endif - -#if !defined(SIMDE_MATH_PI) - #if defined(M_PI) - #define SIMDE_MATH_PI M_PI - #else - #define SIMDE_MATH_PI 3.14159265358979323846 - #endif -#endif - -#if !defined(SIMDE_MATH_PIF) - #if defined(M_PI) - #define SIMDE_MATH_PIF HEDLEY_STATIC_CAST(float, M_PI) - #else - #define SIMDE_MATH_PIF 3.14159265358979323846f - #endif -#endif - -#if !defined(SIMDE_MATH_PI_OVER_180) - #define SIMDE_MATH_PI_OVER_180 0.0174532925199432957692369076848861271344287188854172545609719144 -#endif - -#if !defined(SIMDE_MATH_PI_OVER_180F) - #define SIMDE_MATH_PI_OVER_180F 0.0174532925199432957692369076848861271344287188854172545609719144f -#endif - -#if !defined(SIMDE_MATH_180_OVER_PI) - #define SIMDE_MATH_180_OVER_PI 57.295779513082320876798154814105170332405472466564321549160243861 -#endif - -#if !defined(SIMDE_MATH_180_OVER_PIF) - #define SIMDE_MATH_180_OVER_PIF 57.295779513082320876798154814105170332405472466564321549160243861f -#endif - -#if !defined(SIMDE_MATH_FLT_MIN) - #if defined(__FLT_MIN__) - #define SIMDE_MATH_FLT_MIN __FLT_MIN__ - #else - #if !defined(FLT_MIN) - #if defined(__cplusplus) - #include - #else - #include - #endif - #endif - #define SIMDE_MATH_FLT_MIN FLT_MIN - #endif -#endif - -#if !defined(SIMDE_MATH_FLT_MAX) - #if defined(__FLT_MAX__) - #define SIMDE_MATH_FLT_MAX __FLT_MAX__ - #else - #if !defined(FLT_MAX) - #if defined(__cplusplus) - #include - #else - #include - #endif - #endif - #define SIMDE_MATH_FLT_MAX FLT_MAX - #endif -#endif - -#if !defined(SIMDE_MATH_DBL_MIN) - #if defined(__DBL_MIN__) - #define SIMDE_MATH_DBL_MIN __DBL_MIN__ - #else - #if !defined(DBL_MIN) - #if defined(__cplusplus) - #include - #else - #include - #endif - #endif - #define SIMDE_MATH_DBL_MIN 
DBL_MIN - #endif -#endif - -#if !defined(SIMDE_MATH_DBL_MAX) - #if defined(__DBL_MAX__) - #define SIMDE_MATH_DBL_MAX __DBL_MAX__ - #else - #if !defined(DBL_MAX) - #if defined(__cplusplus) - #include - #else - #include - #endif - #endif - #define SIMDE_MATH_DBL_MAX DBL_MAX - #endif -#endif - -/*** Classification macros from C99 ***/ - -#if !defined(simde_math_isinf) - #if SIMDE_MATH_BUILTIN_LIBM(isinf) - #define simde_math_isinf(v) __builtin_isinf(v) - #elif defined(isinf) || defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_isinf(v) isinf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_isinf(v) std::isinf(v) - #endif -#endif - -#if !defined(simde_math_isinff) - #if HEDLEY_HAS_BUILTIN(__builtin_isinff) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) - #define simde_math_isinff(v) __builtin_isinff(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_isinff(v) std::isinf(v) - #elif defined(simde_math_isinf) - #define simde_math_isinff(v) simde_math_isinf(HEDLEY_STATIC_CAST(double, v)) - #endif -#endif - -#if !defined(simde_math_isnan) - #if SIMDE_MATH_BUILTIN_LIBM(isnan) - #define simde_math_isnan(v) __builtin_isnan(v) - #elif defined(isnan) || defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_isnan(v) isnan(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_isnan(v) std::isnan(v) - #endif -#endif - -#if !defined(simde_math_isnanf) - #if HEDLEY_HAS_BUILTIN(__builtin_isnanf) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) - /* XL C/C++ has __builtin_isnan but not __builtin_isnanf */ - #define simde_math_isnanf(v) __builtin_isnanf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_isnanf(v) std::isnan(v) - #elif defined(simde_math_isnan) - #define simde_math_isnanf(v) simde_math_isnan(HEDLEY_STATIC_CAST(double, v)) - #endif -#endif - -#if !defined(simde_math_isnormal) - #if SIMDE_MATH_BUILTIN_LIBM(isnormal) - #define simde_math_isnormal(v) __builtin_isnormal(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_isnormal(v) isnormal(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_isnormal(v) std::isnormal(v) - #endif -#endif - -#if !defined(simde_math_isnormalf) - #if HEDLEY_HAS_BUILTIN(__builtin_isnormalf) - #define simde_math_isnormalf(v) __builtin_isnormalf(v) - #elif SIMDE_MATH_BUILTIN_LIBM(isnormal) - #define simde_math_isnormalf(v) __builtin_isnormal(v) - #elif defined(isnormalf) - #define simde_math_isnormalf(v) isnormalf(v) - #elif defined(isnormal) || defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_isnormalf(v) isnormal(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_isnormalf(v) std::isnormal(v) - #elif defined(simde_math_isnormal) - #define simde_math_isnormalf(v) simde_math_isnormal(v) - #endif -#endif - -#if !defined(simde_math_issubnormalf) - #if SIMDE_MATH_BUILTIN_LIBM(fpclassify) - #define simde_math_issubnormalf(v) __builtin_fpclassify(0, 0, 0, 1, 0, v) - #elif defined(fpclassify) - #define simde_math_issubnormalf(v) (fpclassify(v) == FP_SUBNORMAL) - #elif defined(SIMDE_IEEE754_STORAGE) - #define simde_math_issubnormalf(v) (((simde_float32_as_uint32(v) & UINT32_C(0x7F800000)) == UINT32_C(0)) && ((simde_float32_as_uint32(v) & UINT32_C(0x007FFFFF)) != UINT32_C(0))) - #endif -#endif - -#if !defined(simde_math_issubnormal) - #if SIMDE_MATH_BUILTIN_LIBM(fpclassify) - #define simde_math_issubnormal(v) __builtin_fpclassify(0, 0, 0, 1, 0, v) - #elif defined(fpclassify) - #define simde_math_issubnormal(v) (fpclassify(v) == 
FP_SUBNORMAL) - #elif defined(SIMDE_IEEE754_STORAGE) - #define simde_math_issubnormal(v) (((simde_float64_as_uint64(v) & UINT64_C(0x7FF0000000000000)) == UINT64_C(0)) && ((simde_float64_as_uint64(v) & UINT64_C(0x00FFFFFFFFFFFFF)) != UINT64_C(0))) - #endif -#endif - -#if defined(FP_NAN) - #define SIMDE_MATH_FP_NAN FP_NAN -#else - #define SIMDE_MATH_FP_NAN 0 -#endif -#if defined(FP_INFINITE) - #define SIMDE_MATH_FP_INFINITE FP_INFINITE -#else - #define SIMDE_MATH_FP_INFINITE 1 -#endif -#if defined(FP_ZERO) - #define SIMDE_MATH_FP_ZERO FP_ZERO -#else - #define SIMDE_MATH_FP_ZERO 2 -#endif -#if defined(FP_SUBNORMAL) - #define SIMDE_MATH_FP_SUBNORMAL FP_SUBNORMAL -#else - #define SIMDE_MATH_FP_SUBNORMAL 3 -#endif -#if defined(FP_NORMAL) - #define SIMDE_MATH_FP_NORMAL FP_NORMAL -#else - #define SIMDE_MATH_FP_NORMAL 4 -#endif - -static HEDLEY_INLINE -int -simde_math_fpclassifyf(float v) { - #if SIMDE_MATH_BUILTIN_LIBM(fpclassify) - return __builtin_fpclassify(SIMDE_MATH_FP_NAN, SIMDE_MATH_FP_INFINITE, SIMDE_MATH_FP_NORMAL, SIMDE_MATH_FP_SUBNORMAL, SIMDE_MATH_FP_ZERO, v); - #elif defined(fpclassify) - return fpclassify(v); - #else - return - simde_math_isnormalf(v) ? SIMDE_MATH_FP_NORMAL : - (v == 0.0f) ? SIMDE_MATH_FP_ZERO : - simde_math_isnanf(v) ? SIMDE_MATH_FP_NAN : - simde_math_isinff(v) ? SIMDE_MATH_FP_INFINITE : - SIMDE_MATH_FP_SUBNORMAL; - #endif -} - -static HEDLEY_INLINE -int -simde_math_fpclassify(double v) { - #if SIMDE_MATH_BUILTIN_LIBM(fpclassify) - return __builtin_fpclassify(SIMDE_MATH_FP_NAN, SIMDE_MATH_FP_INFINITE, SIMDE_MATH_FP_NORMAL, SIMDE_MATH_FP_SUBNORMAL, SIMDE_MATH_FP_ZERO, v); - #elif defined(fpclassify) - return fpclassify(v); - #else - return - simde_math_isnormal(v) ? SIMDE_MATH_FP_NORMAL : - (v == 0.0) ? SIMDE_MATH_FP_ZERO : - simde_math_isnan(v) ? SIMDE_MATH_FP_NAN : - simde_math_isinf(v) ? 
SIMDE_MATH_FP_INFINITE : - SIMDE_MATH_FP_SUBNORMAL; - #endif -} - -#define SIMDE_MATH_FP_QNAN 0x01 -#define SIMDE_MATH_FP_PZERO 0x02 -#define SIMDE_MATH_FP_NZERO 0x04 -#define SIMDE_MATH_FP_PINF 0x08 -#define SIMDE_MATH_FP_NINF 0x10 -#define SIMDE_MATH_FP_DENORMAL 0x20 -#define SIMDE_MATH_FP_NEGATIVE 0x40 -#define SIMDE_MATH_FP_SNAN 0x80 - -static HEDLEY_INLINE -uint8_t -simde_math_fpclassf(float v, const int imm8) { - union { - float f; - uint32_t u; - } fu; - fu.f = v; - uint32_t bits = fu.u; - uint8_t NegNum = (bits >> 31) & 1; - uint32_t const ExpMask = 0x3F800000; // [30:23] - uint32_t const MantMask = 0x007FFFFF; // [22:0] - uint8_t ExpAllOnes = ((bits & ExpMask) == ExpMask); - uint8_t ExpAllZeros = ((bits & ExpMask) == 0); - uint8_t MantAllZeros = ((bits & MantMask) == 0); - uint8_t ZeroNumber = ExpAllZeros & MantAllZeros; - uint8_t SignalingBit = (bits >> 22) & 1; - - uint8_t result = 0; - uint8_t qNaN_res = ExpAllOnes & (!MantAllZeros) & SignalingBit; - uint8_t Pzero_res = (!NegNum) & ExpAllZeros & MantAllZeros; - uint8_t Nzero_res = NegNum & ExpAllZeros & MantAllZeros; - uint8_t Pinf_res = (!NegNum) & ExpAllOnes & MantAllZeros; - uint8_t Ninf_res = NegNum & ExpAllOnes & MantAllZeros; - uint8_t Denorm_res = ExpAllZeros & (!MantAllZeros); - uint8_t FinNeg_res = NegNum & (!ExpAllOnes) & (!ZeroNumber); - uint8_t sNaN_res = ExpAllOnes & (!MantAllZeros) & (!SignalingBit); - result = (((imm8 >> 0) & qNaN_res) | \ - ((imm8 >> 1) & Pzero_res) | \ - ((imm8 >> 2) & Nzero_res) | \ - ((imm8 >> 3) & Pinf_res) | \ - ((imm8 >> 4) & Ninf_res) | \ - ((imm8 >> 5) & Denorm_res) | \ - ((imm8 >> 6) & FinNeg_res) | \ - ((imm8 >> 7) & sNaN_res)); - return result; -} - -static HEDLEY_INLINE -uint8_t -simde_math_fpclass(double v, const int imm8) { - union { - double d; - uint64_t u; - } du; - du.d = v; - uint64_t bits = du.u; - uint8_t NegNum = (bits >> 63) & 1; - uint64_t const ExpMask = 0x3FF0000000000000; // [62:52] - uint64_t const MantMask = 0x000FFFFFFFFFFFFF; // [51:0] - uint8_t ExpAllOnes = ((bits & ExpMask) == ExpMask); - uint8_t ExpAllZeros = ((bits & ExpMask) == 0); - uint8_t MantAllZeros = ((bits & MantMask) == 0); - uint8_t ZeroNumber = ExpAllZeros & MantAllZeros; - uint8_t SignalingBit = (bits >> 51) & 1; - - uint8_t result = 0; - uint8_t qNaN_res = ExpAllOnes & (!MantAllZeros) & SignalingBit; - uint8_t Pzero_res = (!NegNum) & ExpAllZeros & MantAllZeros; - uint8_t Nzero_res = NegNum & ExpAllZeros & MantAllZeros; - uint8_t Pinf_res = (!NegNum) & ExpAllOnes & MantAllZeros; - uint8_t Ninf_res = NegNum & ExpAllOnes & MantAllZeros; - uint8_t Denorm_res = ExpAllZeros & (!MantAllZeros); - uint8_t FinNeg_res = NegNum & (!ExpAllOnes) & (!ZeroNumber); - uint8_t sNaN_res = ExpAllOnes & (!MantAllZeros) & (!SignalingBit); - result = (((imm8 >> 0) & qNaN_res) | \ - ((imm8 >> 1) & Pzero_res) | \ - ((imm8 >> 2) & Nzero_res) | \ - ((imm8 >> 3) & Pinf_res) | \ - ((imm8 >> 4) & Ninf_res) | \ - ((imm8 >> 5) & Denorm_res) | \ - ((imm8 >> 6) & FinNeg_res) | \ - ((imm8 >> 7) & sNaN_res)); - return result; -} - -/*** Manipulation functions ***/ - -#if !defined(simde_math_nextafter) - #if \ - (HEDLEY_HAS_BUILTIN(__builtin_nextafter) && !defined(HEDLEY_IBM_VERSION)) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) - #define simde_math_nextafter(x, y) __builtin_nextafter(x, y) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_nextafter(x, y) std::nextafter(x, y) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_nextafter(x, y) 
nextafter(x, y) - #endif -#endif - -#if !defined(simde_math_nextafterf) - #if \ - (HEDLEY_HAS_BUILTIN(__builtin_nextafterf) && !defined(HEDLEY_IBM_VERSION)) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) - #define simde_math_nextafterf(x, y) __builtin_nextafterf(x, y) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_nextafterf(x, y) std::nextafter(x, y) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_nextafterf(x, y) nextafterf(x, y) - #endif -#endif - -/*** Functions from C99 ***/ - -#if !defined(simde_math_abs) - #if SIMDE_MATH_BUILTIN_LIBM(abs) - #define simde_math_abs(v) __builtin_abs(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_abs(v) std::abs(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_abs(v) abs(v) - #endif -#endif - -#if !defined(simde_math_labs) - #if SIMDE_MATH_BUILTIN_LIBM(labs) - #define simde_math_labs(v) __builtin_labs(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_labs(v) std::labs(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_labs(v) labs(v) - #endif -#endif - -#if !defined(simde_math_llabs) - #if SIMDE_MATH_BUILTIN_LIBM(llabs) - #define simde_math_llabs(v) __builtin_llabs(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_llabs(v) std::llabs(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_llabs(v) llabs(v) - #endif -#endif - -#if !defined(simde_math_fabsf) - #if SIMDE_MATH_BUILTIN_LIBM(fabsf) - #define simde_math_fabsf(v) __builtin_fabsf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_fabsf(v) std::abs(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_fabsf(v) fabsf(v) - #endif -#endif - -#if !defined(simde_math_acos) - #if SIMDE_MATH_BUILTIN_LIBM(acos) - #define simde_math_acos(v) __builtin_acos(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_acos(v) std::acos(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_acos(v) acos(v) - #endif -#endif - -#if !defined(simde_math_acosf) - #if SIMDE_MATH_BUILTIN_LIBM(acosf) - #define simde_math_acosf(v) __builtin_acosf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_acosf(v) std::acos(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_acosf(v) acosf(v) - #endif -#endif - -#if !defined(simde_math_acosh) - #if SIMDE_MATH_BUILTIN_LIBM(acosh) - #define simde_math_acosh(v) __builtin_acosh(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_acosh(v) std::acosh(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_acosh(v) acosh(v) - #endif -#endif - -#if !defined(simde_math_acoshf) - #if SIMDE_MATH_BUILTIN_LIBM(acoshf) - #define simde_math_acoshf(v) __builtin_acoshf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_acoshf(v) std::acosh(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_acoshf(v) acoshf(v) - #endif -#endif - -#if !defined(simde_math_asin) - #if SIMDE_MATH_BUILTIN_LIBM(asin) - #define simde_math_asin(v) __builtin_asin(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_asin(v) std::asin(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_asin(v) asin(v) - #endif -#endif - -#if !defined(simde_math_asinf) - #if SIMDE_MATH_BUILTIN_LIBM(asinf) - #define simde_math_asinf(v) __builtin_asinf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_asinf(v) std::asin(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_asinf(v) asinf(v) - #endif -#endif - -#if 
!defined(simde_math_asinh) - #if SIMDE_MATH_BUILTIN_LIBM(asinh) - #define simde_math_asinh(v) __builtin_asinh(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_asinh(v) std::asinh(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_asinh(v) asinh(v) - #endif -#endif - -#if !defined(simde_math_asinhf) - #if SIMDE_MATH_BUILTIN_LIBM(asinhf) - #define simde_math_asinhf(v) __builtin_asinhf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_asinhf(v) std::asinh(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_asinhf(v) asinhf(v) - #endif -#endif - -#if !defined(simde_math_atan) - #if SIMDE_MATH_BUILTIN_LIBM(atan) - #define simde_math_atan(v) __builtin_atan(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_atan(v) std::atan(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_atan(v) atan(v) - #endif -#endif - -#if !defined(simde_math_atan2) - #if SIMDE_MATH_BUILTIN_LIBM(atan2) - #define simde_math_atan2(y, x) __builtin_atan2(y, x) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_atan2(y, x) std::atan2(y, x) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_atan2(y, x) atan2(y, x) - #endif -#endif - -#if !defined(simde_math_atan2f) - #if SIMDE_MATH_BUILTIN_LIBM(atan2f) - #define simde_math_atan2f(y, x) __builtin_atan2f(y, x) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_atan2f(y, x) std::atan2(y, x) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_atan2f(y, x) atan2f(y, x) - #endif -#endif - -#if !defined(simde_math_atanf) - #if SIMDE_MATH_BUILTIN_LIBM(atanf) - #define simde_math_atanf(v) __builtin_atanf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_atanf(v) std::atan(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_atanf(v) atanf(v) - #endif -#endif - -#if !defined(simde_math_atanh) - #if SIMDE_MATH_BUILTIN_LIBM(atanh) - #define simde_math_atanh(v) __builtin_atanh(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_atanh(v) std::atanh(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_atanh(v) atanh(v) - #endif -#endif - -#if !defined(simde_math_atanhf) - #if SIMDE_MATH_BUILTIN_LIBM(atanhf) - #define simde_math_atanhf(v) __builtin_atanhf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_atanhf(v) std::atanh(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_atanhf(v) atanhf(v) - #endif -#endif - -#if !defined(simde_math_cbrt) - #if SIMDE_MATH_BUILTIN_LIBM(cbrt) - #define simde_math_cbrt(v) __builtin_cbrt(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_cbrt(v) std::cbrt(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_cbrt(v) cbrt(v) - #endif -#endif - -#if !defined(simde_math_cbrtf) - #if SIMDE_MATH_BUILTIN_LIBM(cbrtf) - #define simde_math_cbrtf(v) __builtin_cbrtf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_cbrtf(v) std::cbrt(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_cbrtf(v) cbrtf(v) - #endif -#endif - -#if !defined(simde_math_ceil) - #if SIMDE_MATH_BUILTIN_LIBM(ceil) - #define simde_math_ceil(v) __builtin_ceil(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_ceil(v) std::ceil(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_ceil(v) ceil(v) - #endif -#endif - -#if !defined(simde_math_ceilf) - #if SIMDE_MATH_BUILTIN_LIBM(ceilf) - #define simde_math_ceilf(v) __builtin_ceilf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_ceilf(v) std::ceil(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) 
- #define simde_math_ceilf(v) ceilf(v) - #endif -#endif - -#if !defined(simde_math_copysign) - #if SIMDE_MATH_BUILTIN_LIBM(copysign) - #define simde_math_copysign(x, y) __builtin_copysign(x, y) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_copysign(x, y) std::copysign(x, y) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_copysign(x, y) copysign(x, y) - #endif -#endif - -#if !defined(simde_math_copysignf) - #if SIMDE_MATH_BUILTIN_LIBM(copysignf) - #define simde_math_copysignf(x, y) __builtin_copysignf(x, y) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_copysignf(x, y) std::copysignf(x, y) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_copysignf(x, y) copysignf(x, y) - #endif -#endif - -#if !defined(simde_math_signbit) - #if SIMDE_MATH_BUILTIN_LIBM(signbit) - #if (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) - #define simde_math_signbit(x) __builtin_signbit(x) - #else - #define simde_math_signbit(x) __builtin_signbit(HEDLEY_STATIC_CAST(double, (x))) - #endif - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_signbit(x) std::signbit(x) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_signbit(x) signbit(x) - #endif -#endif - -#if !defined(simde_math_cos) - #if SIMDE_MATH_BUILTIN_LIBM(cos) - #define simde_math_cos(v) __builtin_cos(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_cos(v) std::cos(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_cos(v) cos(v) - #endif -#endif - -#if !defined(simde_math_cosf) - #if defined(SIMDE_MATH_SLEEF_ENABLE) - #if SIMDE_ACCURACY_PREFERENCE < 1 - #define simde_math_cosf(v) Sleef_cosf_u35(v) - #else - #define simde_math_cosf(v) Sleef_cosf_u10(v) - #endif - #elif SIMDE_MATH_BUILTIN_LIBM(cosf) - #define simde_math_cosf(v) __builtin_cosf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_cosf(v) std::cos(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_cosf(v) cosf(v) - #endif -#endif - -#if !defined(simde_math_cosh) - #if SIMDE_MATH_BUILTIN_LIBM(cosh) - #define simde_math_cosh(v) __builtin_cosh(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_cosh(v) std::cosh(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_cosh(v) cosh(v) - #endif -#endif - -#if !defined(simde_math_coshf) - #if SIMDE_MATH_BUILTIN_LIBM(coshf) - #define simde_math_coshf(v) __builtin_coshf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_coshf(v) std::cosh(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_coshf(v) coshf(v) - #endif -#endif - -#if !defined(simde_math_erf) - #if SIMDE_MATH_BUILTIN_LIBM(erf) - #define simde_math_erf(v) __builtin_erf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_erf(v) std::erf(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_erf(v) erf(v) - #endif -#endif - -#if !defined(simde_math_erff) - #if SIMDE_MATH_BUILTIN_LIBM(erff) - #define simde_math_erff(v) __builtin_erff(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_erff(v) std::erf(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_erff(v) erff(v) - #endif -#endif - -#if !defined(simde_math_erfc) - #if SIMDE_MATH_BUILTIN_LIBM(erfc) - #define simde_math_erfc(v) __builtin_erfc(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_erfc(v) std::erfc(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_erfc(v) erfc(v) - #endif -#endif - -#if !defined(simde_math_erfcf) - #if SIMDE_MATH_BUILTIN_LIBM(erfcf) - #define simde_math_erfcf(v) 
__builtin_erfcf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_erfcf(v) std::erfc(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_erfcf(v) erfcf(v) - #endif -#endif - -#if !defined(simde_math_exp) - #if SIMDE_MATH_BUILTIN_LIBM(exp) - #define simde_math_exp(v) __builtin_exp(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_exp(v) std::exp(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_exp(v) exp(v) - #endif -#endif - -#if !defined(simde_math_expf) - #if SIMDE_MATH_BUILTIN_LIBM(expf) - #define simde_math_expf(v) __builtin_expf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_expf(v) std::exp(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_expf(v) expf(v) - #endif -#endif - -#if !defined(simde_math_expm1) - #if SIMDE_MATH_BUILTIN_LIBM(expm1) - #define simde_math_expm1(v) __builtin_expm1(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_expm1(v) std::expm1(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_expm1(v) expm1(v) - #endif -#endif - -#if !defined(simde_math_expm1f) - #if SIMDE_MATH_BUILTIN_LIBM(expm1f) - #define simde_math_expm1f(v) __builtin_expm1f(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_expm1f(v) std::expm1(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_expm1f(v) expm1f(v) - #endif -#endif - -#if !defined(simde_math_exp2) - #if SIMDE_MATH_BUILTIN_LIBM(exp2) - #define simde_math_exp2(v) __builtin_exp2(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_exp2(v) std::exp2(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_exp2(v) exp2(v) - #endif -#endif - -#if !defined(simde_math_exp2f) - #if SIMDE_MATH_BUILTIN_LIBM(exp2f) - #define simde_math_exp2f(v) __builtin_exp2f(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_exp2f(v) std::exp2(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_exp2f(v) exp2f(v) - #endif -#endif - -#if HEDLEY_HAS_BUILTIN(__builtin_exp10) || HEDLEY_GCC_VERSION_CHECK(3,4,0) - # define simde_math_exp10(v) __builtin_exp10(v) -#else -# define simde_math_exp10(v) pow(10.0, (v)) -#endif - -#if HEDLEY_HAS_BUILTIN(__builtin_exp10f) || HEDLEY_GCC_VERSION_CHECK(3,4,0) - # define simde_math_exp10f(v) __builtin_exp10f(v) -#else -# define simde_math_exp10f(v) powf(10.0f, (v)) -#endif - -#if !defined(simde_math_fabs) - #if SIMDE_MATH_BUILTIN_LIBM(fabs) - #define simde_math_fabs(v) __builtin_fabs(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_fabs(v) std::fabs(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_fabs(v) fabs(v) - #endif -#endif - -#if !defined(simde_math_fabsf) - #if SIMDE_MATH_BUILTIN_LIBM(fabsf) - #define simde_math_fabsf(v) __builtin_fabsf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_fabsf(v) std::fabs(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_fabsf(v) fabsf(v) - #endif -#endif - -#if !defined(simde_math_floor) - #if SIMDE_MATH_BUILTIN_LIBM(floor) - #define simde_math_floor(v) __builtin_floor(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_floor(v) std::floor(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_floor(v) floor(v) - #endif -#endif - -#if !defined(simde_math_floorf) - #if SIMDE_MATH_BUILTIN_LIBM(floorf) - #define simde_math_floorf(v) __builtin_floorf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_floorf(v) std::floor(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_floorf(v) floorf(v) - #endif -#endif - -#if 
!defined(simde_math_fma) - #if SIMDE_MATH_BUILTIN_LIBM(fma) - #define simde_math_fma(x, y, z) __builtin_fma(x, y, z) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_fma(x, y, z) std::fma(x, y, z) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_fma(x, y, z) fma(x, y, z) - #endif -#endif - -#if !defined(simde_math_fmaf) - #if SIMDE_MATH_BUILTIN_LIBM(fmaf) - #define simde_math_fmaf(x, y, z) __builtin_fmaf(x, y, z) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_fmaf(x, y, z) std::fma(x, y, z) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_fmaf(x, y, z) fmaf(x, y, z) - #endif -#endif - -#if !defined(simde_math_fmax) - #if SIMDE_MATH_BUILTIN_LIBM(fmax) - #define simde_math_fmax(x, y) __builtin_fmax(x, y) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_fmax(x, y) std::fmax(x, y) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_fmax(x, y) fmax(x, y) - #endif -#endif - -#if !defined(simde_math_fmaxf) - #if SIMDE_MATH_BUILTIN_LIBM(fmaxf) - #define simde_math_fmaxf(x, y) __builtin_fmaxf(x, y) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_fmaxf(x, y) std::fmax(x, y) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_fmaxf(x, y) fmaxf(x, y) - #endif -#endif - -#if !defined(simde_math_hypot) - #if SIMDE_MATH_BUILTIN_LIBM(hypot) - #define simde_math_hypot(y, x) __builtin_hypot(y, x) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_hypot(y, x) std::hypot(y, x) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_hypot(y, x) hypot(y, x) - #endif -#endif - -#if !defined(simde_math_hypotf) - #if SIMDE_MATH_BUILTIN_LIBM(hypotf) - #define simde_math_hypotf(y, x) __builtin_hypotf(y, x) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_hypotf(y, x) std::hypot(y, x) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_hypotf(y, x) hypotf(y, x) - #endif -#endif - -#if !defined(simde_math_log) - #if SIMDE_MATH_BUILTIN_LIBM(log) - #define simde_math_log(v) __builtin_log(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_log(v) std::log(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_log(v) log(v) - #endif -#endif - -#if !defined(simde_math_logf) - #if SIMDE_MATH_BUILTIN_LIBM(logf) - #define simde_math_logf(v) __builtin_logf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_logf(v) std::log(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_logf(v) logf(v) - #endif -#endif - -#if !defined(simde_math_logb) - #if SIMDE_MATH_BUILTIN_LIBM(logb) - #define simde_math_logb(v) __builtin_logb(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_logb(v) std::logb(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_logb(v) logb(v) - #endif -#endif - -#if !defined(simde_math_logbf) - #if SIMDE_MATH_BUILTIN_LIBM(logbf) - #define simde_math_logbf(v) __builtin_logbf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_logbf(v) std::logb(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_logbf(v) logbf(v) - #endif -#endif - -#if !defined(simde_math_log1p) - #if SIMDE_MATH_BUILTIN_LIBM(log1p) - #define simde_math_log1p(v) __builtin_log1p(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_log1p(v) std::log1p(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_log1p(v) log1p(v) - #endif -#endif - -#if !defined(simde_math_log1pf) - #if SIMDE_MATH_BUILTIN_LIBM(log1pf) - #define simde_math_log1pf(v) __builtin_log1pf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define 
simde_math_log1pf(v) std::log1p(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_log1pf(v) log1pf(v) - #endif -#endif - -#if !defined(simde_math_log2) - #if SIMDE_MATH_BUILTIN_LIBM(log2) - #define simde_math_log2(v) __builtin_log2(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_log2(v) std::log2(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_log2(v) log2(v) - #endif -#endif - -#if !defined(simde_math_log2f) - #if SIMDE_MATH_BUILTIN_LIBM(log2f) - #define simde_math_log2f(v) __builtin_log2f(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_log2f(v) std::log2(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_log2f(v) log2f(v) - #endif -#endif - -#if !defined(simde_math_log10) - #if SIMDE_MATH_BUILTIN_LIBM(log10) - #define simde_math_log10(v) __builtin_log10(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_log10(v) std::log10(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_log10(v) log10(v) - #endif -#endif - -#if !defined(simde_math_log10f) - #if SIMDE_MATH_BUILTIN_LIBM(log10f) - #define simde_math_log10f(v) __builtin_log10f(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_log10f(v) std::log10(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_log10f(v) log10f(v) - #endif -#endif - -#if !defined(simde_math_modf) - #if SIMDE_MATH_BUILTIN_LIBM(modf) - #define simde_math_modf(x, iptr) __builtin_modf(x, iptr) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_modf(x, iptr) std::modf(x, iptr) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_modf(x, iptr) modf(x, iptr) - #endif -#endif - -#if !defined(simde_math_modff) - #if SIMDE_MATH_BUILTIN_LIBM(modff) - #define simde_math_modff(x, iptr) __builtin_modff(x, iptr) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_modff(x, iptr) std::modf(x, iptr) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_modff(x, iptr) modff(x, iptr) - #endif -#endif - -#if !defined(simde_math_nearbyint) - #if SIMDE_MATH_BUILTIN_LIBM(nearbyint) - #define simde_math_nearbyint(v) __builtin_nearbyint(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_nearbyint(v) std::nearbyint(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_nearbyint(v) nearbyint(v) - #endif -#endif - -#if !defined(simde_math_nearbyintf) - #if SIMDE_MATH_BUILTIN_LIBM(nearbyintf) - #define simde_math_nearbyintf(v) __builtin_nearbyintf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_nearbyintf(v) std::nearbyint(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_nearbyintf(v) nearbyintf(v) - #endif -#endif - -#if !defined(simde_math_pow) - #if SIMDE_MATH_BUILTIN_LIBM(pow) - #define simde_math_pow(y, x) __builtin_pow(y, x) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_pow(y, x) std::pow(y, x) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_pow(y, x) pow(y, x) - #endif -#endif - -#if !defined(simde_math_powf) - #if SIMDE_MATH_BUILTIN_LIBM(powf) - #define simde_math_powf(y, x) __builtin_powf(y, x) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_powf(y, x) std::pow(y, x) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_powf(y, x) powf(y, x) - #endif -#endif - -#if !defined(simde_math_rint) - #if SIMDE_MATH_BUILTIN_LIBM(rint) - #define simde_math_rint(v) __builtin_rint(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_rint(v) std::rint(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_rint(v) rint(v) - #endif 
-#endif - -#if !defined(simde_math_rintf) - #if SIMDE_MATH_BUILTIN_LIBM(rintf) - #define simde_math_rintf(v) __builtin_rintf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_rintf(v) std::rint(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_rintf(v) rintf(v) - #endif -#endif - -#if !defined(simde_math_round) - #if SIMDE_MATH_BUILTIN_LIBM(round) - #define simde_math_round(v) __builtin_round(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_round(v) std::round(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_round(v) round(v) - #endif -#endif - -#if !defined(simde_math_roundf) - #if SIMDE_MATH_BUILTIN_LIBM(roundf) - #define simde_math_roundf(v) __builtin_roundf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_roundf(v) std::round(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_roundf(v) roundf(v) - #endif -#endif - -#if !defined(simde_math_roundeven) - #if \ - (!defined(HEDLEY_EMSCRIPTEN_VERSION) && HEDLEY_HAS_BUILTIN(__builtin_roundeven)) || \ - HEDLEY_GCC_VERSION_CHECK(10,0,0) - #define simde_math_roundeven(v) __builtin_roundeven(v) - #elif defined(simde_math_round) && defined(simde_math_fabs) - static HEDLEY_INLINE - double - simde_math_roundeven(double v) { - double rounded = simde_math_round(v); - double diff = rounded - v; - if (HEDLEY_UNLIKELY(simde_math_fabs(diff) == 0.5) && (HEDLEY_STATIC_CAST(int64_t, rounded) & 1)) { - rounded = v - diff; - } - return rounded; - } - #define simde_math_roundeven simde_math_roundeven - #endif -#endif - -#if !defined(simde_math_roundevenf) - #if \ - (!defined(HEDLEY_EMSCRIPTEN_VERSION) && HEDLEY_HAS_BUILTIN(__builtin_roundevenf)) || \ - HEDLEY_GCC_VERSION_CHECK(10,0,0) - #define simde_math_roundevenf(v) __builtin_roundevenf(v) - #elif defined(simde_math_roundf) && defined(simde_math_fabsf) - static HEDLEY_INLINE - float - simde_math_roundevenf(float v) { - float rounded = simde_math_roundf(v); - float diff = rounded - v; - if (HEDLEY_UNLIKELY(simde_math_fabsf(diff) == 0.5f) && (HEDLEY_STATIC_CAST(int32_t, rounded) & 1)) { - rounded = v - diff; - } - return rounded; - } - #define simde_math_roundevenf simde_math_roundevenf - #endif -#endif - -#if !defined(simde_math_sin) - #if SIMDE_MATH_BUILTIN_LIBM(sin) - #define simde_math_sin(v) __builtin_sin(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_sin(v) std::sin(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_sin(v) sin(v) - #endif -#endif - -#if !defined(simde_math_sinf) - #if SIMDE_MATH_BUILTIN_LIBM(sinf) - #define simde_math_sinf(v) __builtin_sinf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_sinf(v) std::sin(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_sinf(v) sinf(v) - #endif -#endif - -#if !defined(simde_math_sinh) - #if SIMDE_MATH_BUILTIN_LIBM(sinh) - #define simde_math_sinh(v) __builtin_sinh(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_sinh(v) std::sinh(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_sinh(v) sinh(v) - #endif -#endif - -#if !defined(simde_math_sinhf) - #if SIMDE_MATH_BUILTIN_LIBM(sinhf) - #define simde_math_sinhf(v) __builtin_sinhf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_sinhf(v) std::sinh(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_sinhf(v) sinhf(v) - #endif -#endif - -#if !defined(simde_math_sqrt) - #if SIMDE_MATH_BUILTIN_LIBM(sqrt) - #define simde_math_sqrt(v) __builtin_sqrt(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_sqrt(v) 
std::sqrt(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_sqrt(v) sqrt(v) - #endif -#endif - -#if !defined(simde_math_sqrtf) - #if SIMDE_MATH_BUILTIN_LIBM(sqrtf) - #define simde_math_sqrtf(v) __builtin_sqrtf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_sqrtf(v) std::sqrt(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_sqrtf(v) sqrtf(v) - #endif -#endif - -#if !defined(simde_math_sqrtl) - #if SIMDE_MATH_BUILTIN_LIBM(sqrtl) - #define simde_math_sqrtl(v) __builtin_sqrtl(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_sqrtl(v) std::sqrt(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_sqrtl(v) sqrtl(v) - #endif -#endif - -#if !defined(simde_math_tan) - #if SIMDE_MATH_BUILTIN_LIBM(tan) - #define simde_math_tan(v) __builtin_tan(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_tan(v) std::tan(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_tan(v) tan(v) - #endif -#endif - -#if !defined(simde_math_tanf) - #if SIMDE_MATH_BUILTIN_LIBM(tanf) - #define simde_math_tanf(v) __builtin_tanf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_tanf(v) std::tan(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_tanf(v) tanf(v) - #endif -#endif - -#if !defined(simde_math_tanh) - #if SIMDE_MATH_BUILTIN_LIBM(tanh) - #define simde_math_tanh(v) __builtin_tanh(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_tanh(v) std::tanh(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_tanh(v) tanh(v) - #endif -#endif - -#if !defined(simde_math_tanhf) - #if SIMDE_MATH_BUILTIN_LIBM(tanhf) - #define simde_math_tanhf(v) __builtin_tanhf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_tanhf(v) std::tanh(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_tanhf(v) tanhf(v) - #endif -#endif - -#if !defined(simde_math_trunc) - #if SIMDE_MATH_BUILTIN_LIBM(trunc) - #define simde_math_trunc(v) __builtin_trunc(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_trunc(v) std::trunc(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_trunc(v) trunc(v) - #endif -#endif - -#if !defined(simde_math_truncf) - #if SIMDE_MATH_BUILTIN_LIBM(truncf) - #define simde_math_truncf(v) __builtin_truncf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_truncf(v) std::trunc(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_truncf(v) truncf(v) - #endif -#endif - -/*** Comparison macros (which don't raise invalid errors) ***/ - -#if defined(isunordered) - #define simde_math_isunordered(x, y) isunordered(x, y) -#elif HEDLEY_HAS_BUILTIN(__builtin_isunordered) - #define simde_math_isunordered(x, y) __builtin_isunordered(x, y) -#else - static HEDLEY_INLINE - int simde_math_isunordered(double x, double y) { - return (x != y) && (x != x || y != y); - } - #define simde_math_isunordered simde_math_isunordered - - static HEDLEY_INLINE - int simde_math_isunorderedf(float x, float y) { - return (x != y) && (x != x || y != y); - } - #define simde_math_isunorderedf simde_math_isunorderedf -#endif -#if !defined(simde_math_isunorderedf) - #define simde_math_isunorderedf simde_math_isunordered -#endif - -/*** Additional functions not in libm ***/ - -#if defined(simde_math_fabs) && defined(simde_math_sqrt) && defined(simde_math_exp) - static HEDLEY_INLINE - double - simde_math_cdfnorm(double x) { - /* https://www.johndcook.com/blog/cpp_phi/ - * Public Domain */ - static const double a1 = 0.254829592; - static const double a2 = -0.284496736; 
- static const double a3 = 1.421413741; - static const double a4 = -1.453152027; - static const double a5 = 1.061405429; - static const double p = 0.3275911; - - const int sign = x < 0; - x = simde_math_fabs(x) / simde_math_sqrt(2.0); - - /* A&S formula 7.1.26 */ - double t = 1.0 / (1.0 + p * x); - double y = 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * simde_math_exp(-x * x); - - return 0.5 * (1.0 + (sign ? -y : y)); - } - #define simde_math_cdfnorm simde_math_cdfnorm -#endif - -#if defined(simde_math_fabsf) && defined(simde_math_sqrtf) && defined(simde_math_expf) - static HEDLEY_INLINE - float - simde_math_cdfnormf(float x) { - /* https://www.johndcook.com/blog/cpp_phi/ - * Public Domain */ - static const float a1 = 0.254829592f; - static const float a2 = -0.284496736f; - static const float a3 = 1.421413741f; - static const float a4 = -1.453152027f; - static const float a5 = 1.061405429f; - static const float p = 0.3275911f; - - const int sign = x < 0; - x = simde_math_fabsf(x) / simde_math_sqrtf(2.0f); - - /* A&S formula 7.1.26 */ - float t = 1.0f / (1.0f + p * x); - float y = 1.0f - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * simde_math_expf(-x * x); - - return 0.5f * (1.0f + (sign ? -y : y)); - } - #define simde_math_cdfnormf simde_math_cdfnormf -#endif - -#if !defined(simde_math_cdfnorminv) && defined(simde_math_log) && defined(simde_math_sqrt) - /*https://web.archive.org/web/20150910081113/http://home.online.no/~pjacklam/notes/invnorm/impl/sprouse/ltqnorm.c*/ - static HEDLEY_INLINE - double - simde_math_cdfnorminv(double p) { - static const double a[6] = { - -3.969683028665376e+01, - 2.209460984245205e+02, - -2.759285104469687e+02, - 1.383577518672690e+02, - -3.066479806614716e+01, - 2.506628277459239e+00 - }; - - static const double b[5] = { - -5.447609879822406e+01, - 1.615858368580409e+02, - -1.556989798598866e+02, - 6.680131188771972e+01, - -1.328068155288572e+01 - }; - - static const double c[6] = { - -7.784894002430293e-03, - -3.223964580411365e-01, - -2.400758277161838e+00, - -2.549732539343734e+00, - 4.374664141464968e+00, - 2.938163982698783e+00 - }; - - static const double d[4] = { - 7.784695709041462e-03, - 3.224671290700398e-01, - 2.445134137142996e+00, - 3.754408661907416e+00 - }; - - static const double low = 0.02425; - static const double high = 0.97575; - double q, r; - - if (p < 0 || p > 1) { - return 0.0; - } else if (p == 0) { - return -SIMDE_MATH_INFINITY; - } else if (p == 1) { - return SIMDE_MATH_INFINITY; - } else if (p < low) { - q = simde_math_sqrt(-2.0 * simde_math_log(p)); - return - (((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) / - (((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1)); - } else if (p > high) { - q = simde_math_sqrt(-2.0 * simde_math_log(1.0 - p)); - return - -(((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) / - (((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1)); - } else { - q = p - 0.5; - r = q * q; - return (((((a[0] * r + a[1]) * r + a[2]) * r + a[3]) * r + a[4]) * r + a[5]) * - q / (((((b[0] * r + b[1]) * r + b[2]) * r + b[3]) * r + b[4]) * r + 1); - } -} -#define simde_math_cdfnorminv simde_math_cdfnorminv -#endif - -#if !defined(simde_math_cdfnorminvf) && defined(simde_math_logf) && defined(simde_math_sqrtf) - static HEDLEY_INLINE - float - simde_math_cdfnorminvf(float p) { - static const float a[6] = { - -3.969683028665376e+01f, - 2.209460984245205e+02f, - -2.759285104469687e+02f, - 1.383577518672690e+02f, - -3.066479806614716e+01f, - 
2.506628277459239e+00f - }; - static const float b[5] = { - -5.447609879822406e+01f, - 1.615858368580409e+02f, - -1.556989798598866e+02f, - 6.680131188771972e+01f, - -1.328068155288572e+01f - }; - static const float c[6] = { - -7.784894002430293e-03f, - -3.223964580411365e-01f, - -2.400758277161838e+00f, - -2.549732539343734e+00f, - 4.374664141464968e+00f, - 2.938163982698783e+00f - }; - static const float d[4] = { - 7.784695709041462e-03f, - 3.224671290700398e-01f, - 2.445134137142996e+00f, - 3.754408661907416e+00f - }; - static const float low = 0.02425f; - static const float high = 0.97575f; - float q, r; - - if (p < 0 || p > 1) { - return 0.0f; - } else if (p == 0) { - return -SIMDE_MATH_INFINITYF; - } else if (p == 1) { - return SIMDE_MATH_INFINITYF; - } else if (p < low) { - q = simde_math_sqrtf(-2.0f * simde_math_logf(p)); - return - (((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) / - (((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1)); - } else if (p > high) { - q = simde_math_sqrtf(-2.0f * simde_math_logf(1.0f - p)); - return - -(((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) / - (((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1)); - } else { - q = p - 0.5f; - r = q * q; - return (((((a[0] * r + a[1]) * r + a[2]) * r + a[3]) * r + a[4]) * r + a[5]) * - q / (((((b[0] * r + b[1]) * r + b[2]) * r + b[3]) * r + b[4]) * r + 1); - } - } - #define simde_math_cdfnorminvf simde_math_cdfnorminvf -#endif - -#if !defined(simde_math_erfinv) && defined(simde_math_log) && defined(simde_math_copysign) && defined(simde_math_sqrt) - static HEDLEY_INLINE - double - simde_math_erfinv(double x) { - /* https://stackoverflow.com/questions/27229371/inverse-error-function-in-c - * - * The original answer on SO uses a constant of 0.147, but in my - * testing 0.14829094707965850830078125 gives a lower average absolute error - * (0.0001410958211636170744895935 vs. 0.0001465479290345683693885803). - * That said, if your goal is to minimize the *maximum* absolute - * error, 0.15449436008930206298828125 provides significantly better - * results; 0.0009250640869140625000000000 vs ~ 0.005. 
*/ - double tt1, tt2, lnx; - double sgn = simde_math_copysign(1.0, x); - - x = (1.0 - x) * (1.0 + x); - lnx = simde_math_log(x); - - tt1 = 2.0 / (SIMDE_MATH_PI * 0.14829094707965850830078125) + 0.5 * lnx; - tt2 = (1.0 / 0.14829094707965850830078125) * lnx; - - return sgn * simde_math_sqrt(-tt1 + simde_math_sqrt(tt1 * tt1 - tt2)); - } - #define simde_math_erfinv simde_math_erfinv -#endif - -#if !defined(simde_math_erfinvf) && defined(simde_math_logf) && defined(simde_math_copysignf) && defined(simde_math_sqrtf) - static HEDLEY_INLINE - float - simde_math_erfinvf(float x) { - float tt1, tt2, lnx; - float sgn = simde_math_copysignf(1.0f, x); - - x = (1.0f - x) * (1.0f + x); - lnx = simde_math_logf(x); - - tt1 = 2.0f / (SIMDE_MATH_PIF * 0.14829094707965850830078125f) + 0.5f * lnx; - tt2 = (1.0f / 0.14829094707965850830078125f) * lnx; - - return sgn * simde_math_sqrtf(-tt1 + simde_math_sqrtf(tt1 * tt1 - tt2)); - } - #define simde_math_erfinvf simde_math_erfinvf -#endif - -#if !defined(simde_math_erfcinv) && defined(simde_math_erfinv) && defined(simde_math_log) && defined(simde_math_sqrt) - static HEDLEY_INLINE - double - simde_math_erfcinv(double x) { - if(x >= 0.0625 && x < 2.0) { - return simde_math_erfinv(1.0 - x); - } else if (x < 0.0625 && x >= 1.0e-100) { - static const double p[6] = { - 0.1550470003116, - 1.382719649631, - 0.690969348887, - -1.128081391617, - 0.680544246825, - -0.16444156791 - }; - static const double q[3] = { - 0.155024849822, - 1.385228141995, - 1.000000000000 - }; - - const double t = 1.0 / simde_math_sqrt(-simde_math_log(x)); - return (p[0] / t + p[1] + t * (p[2] + t * (p[3] + t * (p[4] + t * p[5])))) / - (q[0] + t * (q[1] + t * (q[2]))); - } else if (x < 1.0e-100 && x >= SIMDE_MATH_DBL_MIN) { - static const double p[4] = { - 0.00980456202915, - 0.363667889171, - 0.97302949837, - -0.5374947401 - }; - static const double q[3] = { - 0.00980451277802, - 0.363699971544, - 1.000000000000 - }; - - const double t = 1.0 / simde_math_sqrt(-simde_math_log(x)); - return (p[0] / t + p[1] + t * (p[2] + t * p[3])) / - (q[0] + t * (q[1] + t * (q[2]))); - } else if (!simde_math_isnormal(x)) { - return SIMDE_MATH_INFINITY; - } else { - return -SIMDE_MATH_INFINITY; - } - } - - #define simde_math_erfcinv simde_math_erfcinv -#endif - -#if !defined(simde_math_erfcinvf) && defined(simde_math_erfinvf) && defined(simde_math_logf) && defined(simde_math_sqrtf) - static HEDLEY_INLINE - float - simde_math_erfcinvf(float x) { - if(x >= 0.0625f && x < 2.0f) { - return simde_math_erfinvf(1.0f - x); - } else if (x < 0.0625f && x >= SIMDE_MATH_FLT_MIN) { - static const float p[6] = { - 0.1550470003116f, - 1.382719649631f, - 0.690969348887f, - -1.128081391617f, - 0.680544246825f - -0.164441567910f - }; - static const float q[3] = { - 0.155024849822f, - 1.385228141995f, - 1.000000000000f - }; - - const float t = 1.0f / simde_math_sqrtf(-simde_math_logf(x)); - return (p[0] / t + p[1] + t * (p[2] + t * (p[3] + t * (p[4] + t * p[5])))) / - (q[0] + t * (q[1] + t * (q[2]))); - } else if (x < SIMDE_MATH_FLT_MIN && simde_math_isnormalf(x)) { - static const float p[4] = { - 0.00980456202915f, - 0.36366788917100f, - 0.97302949837000f, - -0.5374947401000f - }; - static const float q[3] = { - 0.00980451277802f, - 0.36369997154400f, - 1.00000000000000f - }; - - const float t = 1.0f / simde_math_sqrtf(-simde_math_logf(x)); - return (p[0] / t + p[1] + t * (p[2] + t * p[3])) / - (q[0] + t * (q[1] + t * (q[2]))); - } else { - return simde_math_isnormalf(x) ? 
-SIMDE_MATH_INFINITYF : SIMDE_MATH_INFINITYF; - } - } - - #define simde_math_erfcinvf simde_math_erfcinvf -#endif - -static HEDLEY_INLINE -double -simde_math_rad2deg(double radians) { - return radians * SIMDE_MATH_180_OVER_PI; -} - -static HEDLEY_INLINE -float -simde_math_rad2degf(float radians) { - return radians * SIMDE_MATH_180_OVER_PIF; -} - -static HEDLEY_INLINE -double -simde_math_deg2rad(double degrees) { - return degrees * SIMDE_MATH_PI_OVER_180; -} - -static HEDLEY_INLINE -float -simde_math_deg2radf(float degrees) { - return degrees * (SIMDE_MATH_PI_OVER_180F); -} - -/*** Saturated arithmetic ***/ - -static HEDLEY_INLINE -int8_t -simde_math_adds_i8(int8_t a, int8_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vqaddb_s8(a, b); - #else - uint8_t a_ = HEDLEY_STATIC_CAST(uint8_t, a); - uint8_t b_ = HEDLEY_STATIC_CAST(uint8_t, b); - uint8_t r_ = a_ + b_; - - a_ = (a_ >> ((8 * sizeof(r_)) - 1)) + INT8_MAX; - if (HEDLEY_STATIC_CAST(int8_t, ((a_ ^ b_) | ~(b_ ^ r_))) >= 0) { - r_ = a_; - } - - return HEDLEY_STATIC_CAST(int8_t, r_); - #endif -} - -static HEDLEY_INLINE -int16_t -simde_math_adds_i16(int16_t a, int16_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vqaddh_s16(a, b); - #else - uint16_t a_ = HEDLEY_STATIC_CAST(uint16_t, a); - uint16_t b_ = HEDLEY_STATIC_CAST(uint16_t, b); - uint16_t r_ = a_ + b_; - - a_ = (a_ >> ((8 * sizeof(r_)) - 1)) + INT16_MAX; - if (HEDLEY_STATIC_CAST(int16_t, ((a_ ^ b_) | ~(b_ ^ r_))) >= 0) { - r_ = a_; - } - - return HEDLEY_STATIC_CAST(int16_t, r_); - #endif -} - -static HEDLEY_INLINE -int32_t -simde_math_adds_i32(int32_t a, int32_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vqadds_s32(a, b); - #else - uint32_t a_ = HEDLEY_STATIC_CAST(uint32_t, a); - uint32_t b_ = HEDLEY_STATIC_CAST(uint32_t, b); - uint32_t r_ = a_ + b_; - - a_ = (a_ >> ((8 * sizeof(r_)) - 1)) + INT32_MAX; - if (HEDLEY_STATIC_CAST(int32_t, ((a_ ^ b_) | ~(b_ ^ r_))) >= 0) { - r_ = a_; - } - - return HEDLEY_STATIC_CAST(int32_t, r_); - #endif -} - -static HEDLEY_INLINE -int64_t -simde_math_adds_i64(int64_t a, int64_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vqaddd_s64(a, b); - #else - uint64_t a_ = HEDLEY_STATIC_CAST(uint64_t, a); - uint64_t b_ = HEDLEY_STATIC_CAST(uint64_t, b); - uint64_t r_ = a_ + b_; - - a_ = (a_ >> ((8 * sizeof(r_)) - 1)) + INT64_MAX; - if (HEDLEY_STATIC_CAST(int64_t, ((a_ ^ b_) | ~(b_ ^ r_))) >= 0) { - r_ = a_; - } - - return HEDLEY_STATIC_CAST(int64_t, r_); - #endif -} - -static HEDLEY_INLINE -uint8_t -simde_math_adds_u8(uint8_t a, uint8_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vqaddb_u8(a, b); - #else - uint8_t r = a + b; - r |= -(r < a); - return r; - #endif -} - -static HEDLEY_INLINE -uint16_t -simde_math_adds_u16(uint16_t a, uint16_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vqaddh_u16(a, b); - #else - uint16_t r = a + b; - r |= -(r < a); - return r; - #endif -} - -static HEDLEY_INLINE -uint32_t -simde_math_adds_u32(uint32_t a, uint32_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vqadds_u32(a, b); - #else - uint32_t r = a + b; - r |= -(r < a); - return r; - #endif -} - -static HEDLEY_INLINE -uint64_t -simde_math_adds_u64(uint64_t a, uint64_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vqaddd_u64(a, b); - #else - uint64_t r = a + b; - r |= -(r < a); - return r; - #endif -} - -static HEDLEY_INLINE -int8_t -simde_math_subs_i8(int8_t a, int8_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vqsubb_s8(a, b); - #else - uint8_t a_ = 
HEDLEY_STATIC_CAST(uint8_t, a); - uint8_t b_ = HEDLEY_STATIC_CAST(uint8_t, b); - uint8_t r_ = a_ - b_; - - a_ = (a_ >> 7) + INT8_MAX; - - if (HEDLEY_STATIC_CAST(int8_t, (a_ ^ b_) & (a_ ^ r_)) < 0) { - r_ = a_; - } - - return HEDLEY_STATIC_CAST(int8_t, r_); - #endif -} - -static HEDLEY_INLINE -int16_t -simde_math_subs_i16(int16_t a, int16_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vqsubh_s16(a, b); - #else - uint16_t a_ = HEDLEY_STATIC_CAST(uint16_t, a); - uint16_t b_ = HEDLEY_STATIC_CAST(uint16_t, b); - uint16_t r_ = a_ - b_; - - a_ = (a_ >> 15) + INT16_MAX; - - if (HEDLEY_STATIC_CAST(int16_t, (a_ ^ b_) & (a_ ^ r_)) < 0) { - r_ = a_; - } - - return HEDLEY_STATIC_CAST(int16_t, r_); - #endif -} - -static HEDLEY_INLINE -int32_t -simde_math_subs_i32(int32_t a, int32_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vqsubs_s32(a, b); - #else - uint32_t a_ = HEDLEY_STATIC_CAST(uint32_t, a); - uint32_t b_ = HEDLEY_STATIC_CAST(uint32_t, b); - uint32_t r_ = a_ - b_; - - a_ = (a_ >> 31) + INT32_MAX; - - if (HEDLEY_STATIC_CAST(int32_t, (a_ ^ b_) & (a_ ^ r_)) < 0) { - r_ = a_; - } - - return HEDLEY_STATIC_CAST(int32_t, r_); - #endif -} - -static HEDLEY_INLINE -int64_t -simde_math_subs_i64(int64_t a, int64_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vqsubd_s64(a, b); - #else - uint64_t a_ = HEDLEY_STATIC_CAST(uint64_t, a); - uint64_t b_ = HEDLEY_STATIC_CAST(uint64_t, b); - uint64_t r_ = a_ - b_; - - a_ = (a_ >> 63) + INT64_MAX; - - if (HEDLEY_STATIC_CAST(int64_t, (a_ ^ b_) & (a_ ^ r_)) < 0) { - r_ = a_; - } - - return HEDLEY_STATIC_CAST(int64_t, r_); - #endif -} - -static HEDLEY_INLINE -uint8_t -simde_math_subs_u8(uint8_t a, uint8_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vqsubb_u8(a, b); - #else - uint8_t res = a - b; - res &= -(res <= a); - return res; - #endif -} - -static HEDLEY_INLINE -uint16_t -simde_math_subs_u16(uint16_t a, uint16_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vqsubh_u16(a, b); - #else - uint16_t res = a - b; - res &= -(res <= a); - return res; - #endif -} - -static HEDLEY_INLINE -uint32_t -simde_math_subs_u32(uint32_t a, uint32_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vqsubs_u32(a, b); - #else - uint32_t res = a - b; - res &= -(res <= a); - return res; - #endif -} - -static HEDLEY_INLINE -uint64_t -simde_math_subs_u64(uint64_t a, uint64_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vqsubd_u64(a, b); - #else - uint64_t res = a - b; - res &= -(res <= a); - return res; - #endif -} - -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_MATH_H) */ diff --git a/extern/simde/x86/aes.h b/extern/simde/x86/aes.h deleted file mode 100644 index 1d5b04926..000000000 --- a/extern/simde/x86/aes.h +++ /dev/null @@ -1,417 +0,0 @@ -/* MIT License - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ - -#if !defined(SIMDE_X86_AES_H) -#define SIMDE_X86_AES_H - -/* - * Advanced Encryption Standard - * @author Dani Huertas - * @email huertas.dani@gmail.com - * - * Based on the document FIPS PUB 197 - */ - -#include "sse2.h" - -/* - * Multiplication in GF(2^8) - * http://en.wikipedia.org/wiki/Finite_field_arithmetic - * Irreducible polynomial m(x) = x8 + x4 + x3 + x + 1 - * - * NOTE: This function can be easily replaced with a look up table for a speed - * boost, at the expense of an increase in memory size. - -SIMDE_FUNCTION_ATTRIBUTES -uint8_t gmult(uint8_t a, uint8_t b) { - uint8_t p = 0, i = 0, hbs = 0; - - for (i = 0; i < 8; i++) { - if (b & 1) { - p ^= a; - } - - hbs = a & 0x80; - a <<= 1; - if (hbs) a ^= 0x1b; // 0000 0001 0001 1011 - b >>= 1; - } - - return (uint8_t)p; -} - */ - -#if !(defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO)) - -#include "../simde-aes.h" - -/* - * Transformation in the Cipher and Inverse Cipher in which a Round - * Key is added to the State using an XOR operation. The length of a - * Round Key equals the size of the State (i.e., for Nb = 4, the Round - * Key length equals 128 bits/16 bytes). - */ -SIMDE_FUNCTION_ATTRIBUTES -void simde_x_aes_add_round_key(uint8_t *state, simde__m128i_private w, uint8_t r) { - - int Nb = simde_x_aes_Nb; - uint8_t c; - - for (c = 0; c < Nb; c++) { - state[Nb*0+c] = state[Nb*0+c]^w.u8[4*Nb*r+4*c+0]; - state[Nb*1+c] = state[Nb*1+c]^w.u8[4*Nb*r+4*c+1]; - state[Nb*2+c] = state[Nb*2+c]^w.u8[4*Nb*r+4*c+2]; - state[Nb*3+c] = state[Nb*3+c]^w.u8[4*Nb*r+4*c+3]; - } -} - -/* - * Transformation in the Cipher that takes all of the columns of the - * State and mixes their data (independently of one another) to - * produce new columns. - */ -SIMDE_FUNCTION_ATTRIBUTES -void simde_x_aes_mix_columns(uint8_t *state) { - - int Nb = simde_x_aes_Nb; - // uint8_t k[] = {0x02, 0x01, 0x01, 0x03}; // a(x) = {02} + {01}x + {01}x2 + {03}x3 - uint8_t i, j, col[4], res[4]; - - for (j = 0; j < Nb; j++) { - for (i = 0; i < 4; i++) { - col[i] = state[Nb*i+j]; - } - - //coef_mult(k, col, res); - simde_x_aes_coef_mult_lookup(0, col, res); - - for (i = 0; i < 4; i++) { - state[Nb*i+j] = res[i]; - } - } -} - -/* - * Transformation in the Inverse Cipher that is the inverse of - * MixColumns(). - */ -SIMDE_FUNCTION_ATTRIBUTES -void simde_x_aes_inv_mix_columns(uint8_t *state) { - - int Nb = simde_x_aes_Nb; - // uint8_t k[] = {0x0e, 0x09, 0x0d, 0x0b}; // a(x) = {0e} + {09}x + {0d}x2 + {0b}x3 - uint8_t i, j, col[4], res[4]; - - for (j = 0; j < Nb; j++) { - for (i = 0; i < 4; i++) { - col[i] = state[Nb*i+j]; - } - - //coef_mult(k, col, res); - simde_x_aes_coef_mult_lookup(4, col, res); - - for (i = 0; i < 4; i++) { - state[Nb*i+j] = res[i]; - } - } -} - -/* - * Transformation in the Cipher that processes the State by cyclically - * shifting the last three rows of the State by different offsets. 
- */ -SIMDE_FUNCTION_ATTRIBUTES -void simde_x_aes_shift_rows(uint8_t *state) { - - int Nb = simde_x_aes_Nb; - uint8_t i, k, s, tmp; - - for (i = 1; i < 4; i++) { - // shift(1,4)=1; shift(2,4)=2; shift(3,4)=3 - // shift(r, 4) = r; - s = 0; - while (s < i) { - tmp = state[Nb*i+0]; - - for (k = 1; k < Nb; k++) { - state[Nb*i+k-1] = state[Nb*i+k]; - } - - state[Nb*i+Nb-1] = tmp; - s++; - } - } -} - -/* - * Transformation in the Inverse Cipher that is the inverse of - * ShiftRows(). - */ -SIMDE_FUNCTION_ATTRIBUTES -void simde_x_aes_inv_shift_rows(uint8_t *state) { - - uint8_t Nb = simde_x_aes_Nb; - uint8_t i, k, s, tmp; - - for (i = 1; i < 4; i++) { - s = 0; - while (s < i) { - tmp = state[Nb*i+Nb-1]; - - for (k = Nb-1; k > 0; k--) { - state[Nb*i+k] = state[Nb*i+k-1]; - } - - state[Nb*i+0] = tmp; - s++; - } - } -} - -/* - * Transformation in the Cipher that processes the State using a non - * linear byte substitution table (S-box) that operates on each of the - * State bytes independently. - */ -SIMDE_FUNCTION_ATTRIBUTES -void simde_x_aes_sub_bytes(uint8_t *state) { - - int Nb = simde_x_aes_Nb; - uint8_t i, j; - - for (i = 0; i < 4; i++) { - for (j = 0; j < Nb; j++) { - // s_box row: yyyy ---- - // s_box col: ---- xxxx - // s_box[16*(yyyy) + xxxx] == s_box[yyyyxxxx] - state[Nb*i+j] = simde_x_aes_s_box[state[Nb*i+j]]; - } - } -} - -/* - * Transformation in the Inverse Cipher that is the inverse of - * SubBytes(). - */ -SIMDE_FUNCTION_ATTRIBUTES -void simde_x_aes_inv_sub_bytes(uint8_t *state) { - - int Nb = simde_x_aes_Nb; - uint8_t i, j; - - for (i = 0; i < 4; i++) { - for (j = 0; j < Nb; j++) { - state[Nb*i+j] = simde_x_aes_inv_s_box[state[Nb*i+j]]; - } - } -} - -/* - * Performs the AES cipher operation - */ -SIMDE_FUNCTION_ATTRIBUTES -void simde_x_aes_enc(simde__m128i_private in, simde__m128i_private *out, simde__m128i_private w, int is_last) { - - int Nb = simde_x_aes_Nb; - uint8_t state[4*simde_x_aes_Nb]; - uint8_t r = 0, i, j; - - for (i = 0; i < 4; i++) { - for (j = 0; j < Nb; j++) { - state[Nb*i+j] = in.u8[i+4*j]; - } - } - - simde_x_aes_sub_bytes(state); - simde_x_aes_shift_rows(state); - - if (!is_last) - simde_x_aes_mix_columns(state); - - simde_x_aes_add_round_key(state, w, r); - - for (i = 0; i < 4; i++) { - for (j = 0; j < Nb; j++) { - out->u8[i+4*j] = state[Nb*i+j]; - } - } -} - -/* - * Performs the AES inverse cipher operation - */ -SIMDE_FUNCTION_ATTRIBUTES -void simde_x_aes_dec(simde__m128i_private in, simde__m128i_private *out, simde__m128i_private w, int is_last) { - - int Nb = simde_x_aes_Nb; - uint8_t state[4*simde_x_aes_Nb]; - uint8_t r = 0, i, j; - - for (i = 0; i < 4; i++) { - for (j = 0; j < Nb; j++) { - state[Nb*i+j] = in.u8[i+4*j]; - } - } - - simde_x_aes_inv_shift_rows(state); - simde_x_aes_inv_sub_bytes(state); - - if (!is_last) - simde_x_aes_inv_mix_columns(state); - - simde_x_aes_add_round_key(state, w, r); - - for (i = 0; i < 4; i++) { - for (j = 0; j < Nb; j++) { - out->u8[i+4*j] = state[Nb*i+j]; - } - } -} -#endif // if !(defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO)) - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i simde_mm_aesenc_si128(simde__m128i a, simde__m128i round_key) { - #if defined(SIMDE_X86_AES_NATIVE) - return _mm_aesenc_si128(a, round_key); - #else - simde__m128i_private result_; - simde__m128i_private a_ = simde__m128i_to_private(a); - simde__m128i_private round_key_ = simde__m128i_to_private(round_key); - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) - result_.neon_u8 = veorq_u8( - 
vaesmcq_u8(vaeseq_u8(a_.neon_u8, vdupq_n_u8(0))), - round_key_.neon_u8); - #else - simde_x_aes_enc(a_, &result_, round_key_, 0); - #endif - return simde__m128i_from_private(result_); - #endif -} -#if defined(SIMDE_X86_AES_ENABLE_NATIVE_ALIASES) - #define _mm_aesenc_si128(a, b) simde_mm_aesenc_si128(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i simde_mm_aesdec_si128(simde__m128i a, simde__m128i round_key) { - #if defined(SIMDE_X86_AES_NATIVE) - return _mm_aesdec_si128(a, round_key); - #else - simde__m128i_private result_; - simde__m128i_private a_ = simde__m128i_to_private(a); - simde__m128i_private round_key_ = simde__m128i_to_private(round_key); - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) - result_.neon_u8 = veorq_u8( - vaesimcq_u8(vaesdq_u8(a_.neon_u8, vdupq_n_u8(0))), - round_key_.neon_u8); - #else - simde_x_aes_dec(a_, &result_, round_key_, 0); - #endif - return simde__m128i_from_private(result_); - #endif -} -#if defined(SIMDE_X86_AES_ENABLE_NATIVE_ALIASES) - #define _mm_aesdec_si128(a, b) simde_mm_aesdec_si128(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i simde_mm_aesenclast_si128(simde__m128i a, simde__m128i round_key) { - #if defined(SIMDE_X86_AES_NATIVE) - return _mm_aesenclast_si128(a, round_key); - #else - simde__m128i_private result_; - simde__m128i_private a_ = simde__m128i_to_private(a); - simde__m128i_private round_key_ = simde__m128i_to_private(round_key); - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) - result_.neon_u8 = vaeseq_u8(a_.neon_u8, vdupq_n_u8(0)); - result_.neon_i32 = veorq_s32(result_.neon_i32, round_key_.neon_i32); // _mm_xor_si128 - #else - simde_x_aes_enc(a_, &result_, round_key_, 1); - #endif - return simde__m128i_from_private(result_); - #endif -} -#if defined(SIMDE_X86_AES_ENABLE_NATIVE_ALIASES) - #define _mm_aesenclast_si128(a, b) simde_mm_aesenclast_si128(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i simde_mm_aesdeclast_si128(simde__m128i a, simde__m128i round_key) { - #if defined(SIMDE_X86_AES_NATIVE) - return _mm_aesdeclast_si128(a, round_key); - #else - simde__m128i_private result_; - simde__m128i_private a_ = simde__m128i_to_private(a); - simde__m128i_private round_key_ = simde__m128i_to_private(round_key); - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) - result_.neon_u8 = veorq_u8( - vaesdq_u8(a_.neon_u8, vdupq_n_u8(0)), - round_key_.neon_u8); - #else - simde_x_aes_dec(a_, &result_, round_key_, 1); - #endif - return simde__m128i_from_private(result_); - #endif -} -#if defined(SIMDE_X86_AES_ENABLE_NATIVE_ALIASES) - #define _mm_aesdeclast_si128(a, b) simde_mm_aesdeclast_si128(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i simde_mm_aesimc_si128(simde__m128i a) { - #if defined(SIMDE_X86_AES_NATIVE) - return _mm_aesimc_si128(a); - #else - simde__m128i_private result_ = simde__m128i_to_private(simde_mm_setzero_si128()); - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) - result_.neon_u8 = vaesimcq_u8(a_.neon_u8); - #else - int Nb = simde_x_aes_Nb; - // uint8_t k[] = {0x0e, 0x09, 0x0d, 0x0b}; // a(x) = {0e} + {09}x + {0d}x2 + {0b}x3 - uint8_t i, j, col[4], res[4]; - - for (j = 0; j < Nb; j++) { - for (i = 0; i < 4; i++) { - col[i] = a_.u8[Nb*j+i]; - } - - //coef_mult(k, col, res); - simde_x_aes_coef_mult_lookup(4, col, res); - - for (i = 0; i < 4; i++) { - result_.u8[Nb*j+i] = res[i]; - } - } - #endif - return 
simde__m128i_from_private(result_); - #endif -} -#if defined(SIMDE_X86_AES_ENABLE_NATIVE_ALIASES) - #define _mm_aesimc_si128(a) simde_mm_aesimc_si128(a) -#endif - -#undef simde_x_aes_Nb - -#endif /* !defined(SIMDE_X86_AES_H) */ diff --git a/extern/simde/x86/avx.h b/extern/simde/x86/avx.h deleted file mode 100644 index 2314f9556..000000000 --- a/extern/simde/x86/avx.h +++ /dev/null @@ -1,6267 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2018-2020 Evan Nemerson - * 2020 Michael R. Crusoe - */ - -#include "sse.h" -#if !defined(SIMDE_X86_AVX_H) -#define SIMDE_X86_AVX_H - -#include "sse4.2.h" -#include "../simde-f16.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -typedef union { - #if defined(SIMDE_VECTOR_SUBSCRIPT) - SIMDE_ALIGN_TO_32 int8_t i8 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 int16_t i16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 int32_t i32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 int64_t i64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 uint8_t u8 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 uint16_t u16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 uint32_t u32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 uint64_t u64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - #if defined(SIMDE_HAVE_INT128_) - SIMDE_ALIGN_TO_32 simde_int128 i128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 simde_uint128 u128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - #endif - SIMDE_ALIGN_TO_32 simde_float32 f32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 simde_float64 f64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 int_fast32_t i32f SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 uint_fast32_t u32f SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - #else - SIMDE_ALIGN_TO_32 int8_t i8[32]; - SIMDE_ALIGN_TO_32 int16_t i16[16]; - SIMDE_ALIGN_TO_32 int32_t i32[8]; - SIMDE_ALIGN_TO_32 int64_t i64[4]; - SIMDE_ALIGN_TO_32 uint8_t u8[32]; - SIMDE_ALIGN_TO_32 uint16_t u16[16]; - SIMDE_ALIGN_TO_32 uint32_t u32[8]; - SIMDE_ALIGN_TO_32 uint64_t u64[4]; - SIMDE_ALIGN_TO_32 int_fast32_t i32f[32 / sizeof(int_fast32_t)]; - SIMDE_ALIGN_TO_32 uint_fast32_t u32f[32 / sizeof(uint_fast32_t)]; - #if defined(SIMDE_HAVE_INT128_) - SIMDE_ALIGN_TO_32 simde_int128 i128[2]; - SIMDE_ALIGN_TO_32 simde_uint128 u128[2]; - #endif - SIMDE_ALIGN_TO_32 simde_float32 f32[8]; - SIMDE_ALIGN_TO_32 simde_float64 f64[4]; - #endif - - 
SIMDE_ALIGN_TO_32 simde__m128_private m128_private[2]; - SIMDE_ALIGN_TO_32 simde__m128 m128[2]; - - #if defined(SIMDE_X86_AVX_NATIVE) - SIMDE_ALIGN_TO_32 __m256 n; - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(int) altivec_i32[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32[2]; - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(long long) altivec_i64[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64[2]; - #endif - #endif -} simde__m256_private; - -typedef union { - #if defined(SIMDE_VECTOR_SUBSCRIPT) - SIMDE_ALIGN_TO_32 int8_t i8 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 int16_t i16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 int32_t i32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 int64_t i64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 uint8_t u8 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 uint16_t u16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 uint32_t u32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 uint64_t u64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - #if defined(SIMDE_HAVE_INT128_) - SIMDE_ALIGN_TO_32 simde_int128 i128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 simde_uint128 u128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - #endif - SIMDE_ALIGN_TO_32 simde_float32 f32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 simde_float64 f64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 int_fast32_t i32f SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 uint_fast32_t u32f SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - #else - SIMDE_ALIGN_TO_32 int8_t i8[32]; - SIMDE_ALIGN_TO_32 int16_t i16[16]; - SIMDE_ALIGN_TO_32 int32_t i32[8]; - SIMDE_ALIGN_TO_32 int64_t i64[4]; - SIMDE_ALIGN_TO_32 uint8_t u8[32]; - SIMDE_ALIGN_TO_32 uint16_t u16[16]; - SIMDE_ALIGN_TO_32 uint32_t u32[8]; - SIMDE_ALIGN_TO_32 uint64_t u64[4]; - #if defined(SIMDE_HAVE_INT128_) - SIMDE_ALIGN_TO_32 simde_int128 i128[2]; - SIMDE_ALIGN_TO_32 simde_uint128 u128[2]; - #endif - SIMDE_ALIGN_TO_32 simde_float32 f32[8]; - SIMDE_ALIGN_TO_32 simde_float64 f64[4]; - SIMDE_ALIGN_TO_32 int_fast32_t i32f[32 / sizeof(int_fast32_t)]; - SIMDE_ALIGN_TO_32 uint_fast32_t u32f[32 / sizeof(uint_fast32_t)]; - #endif - - SIMDE_ALIGN_TO_32 simde__m128d_private m128d_private[2]; - SIMDE_ALIGN_TO_32 simde__m128d m128d[2]; - - #if defined(SIMDE_X86_AVX_NATIVE) - SIMDE_ALIGN_TO_32 __m256d n; - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32[2]; - #if 
defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64[2]; - #endif - #endif -} simde__m256d_private; - -typedef union { - #if defined(SIMDE_VECTOR_SUBSCRIPT) - SIMDE_ALIGN_TO_32 int8_t i8 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 int16_t i16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 int32_t i32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 int64_t i64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 uint8_t u8 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 uint16_t u16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 uint32_t u32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 uint64_t u64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - #if defined(SIMDE_HAVE_INT128_) - SIMDE_ALIGN_TO_32 simde_int128 i128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 simde_uint128 u128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - #endif - #if defined(SIMDE_FLOAT16_VECTOR) - SIMDE_ALIGN_TO_32 simde_float16 f16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - #else - SIMDE_ALIGN_TO_32 simde_float16 f16[16]; - #endif - SIMDE_ALIGN_TO_32 simde_float32 f32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 simde_float64 f64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 int_fast32_t i32f SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 uint_fast32_t u32f SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - #else - SIMDE_ALIGN_TO_32 int8_t i8[32]; - SIMDE_ALIGN_TO_32 int16_t i16[16]; - SIMDE_ALIGN_TO_32 int32_t i32[8]; - SIMDE_ALIGN_TO_32 int64_t i64[4]; - SIMDE_ALIGN_TO_32 uint8_t u8[32]; - SIMDE_ALIGN_TO_32 uint16_t u16[16]; - SIMDE_ALIGN_TO_32 uint32_t u32[8]; - SIMDE_ALIGN_TO_32 uint64_t u64[4]; - SIMDE_ALIGN_TO_32 int_fast32_t i32f[32 / sizeof(int_fast32_t)]; - SIMDE_ALIGN_TO_32 uint_fast32_t u32f[32 / sizeof(uint_fast32_t)]; - #if defined(SIMDE_HAVE_INT128_) - SIMDE_ALIGN_TO_32 simde_int128 i128[2]; - SIMDE_ALIGN_TO_32 simde_uint128 u128[2]; - #endif - SIMDE_ALIGN_TO_32 simde_float16 f16[16]; - SIMDE_ALIGN_TO_32 simde_float32 f32[8]; - SIMDE_ALIGN_TO_32 simde_float64 f64[4]; - #endif - - SIMDE_ALIGN_TO_32 simde__m128i_private m128i_private[2]; - SIMDE_ALIGN_TO_32 simde__m128i m128i[2]; - - #if defined(SIMDE_X86_AVX_NATIVE) - SIMDE_ALIGN_TO_32 __m256i n; - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32[2]; - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64[2]; - #endif - #endif -} simde__m256i_private; - -#if defined(SIMDE_X86_AVX_NATIVE) - typedef __m256 simde__m256; - typedef __m256i simde__m256i; - typedef __m256d simde__m256d; -#elif defined(SIMDE_VECTOR_SUBSCRIPT) - typedef simde_float32 simde__m256 SIMDE_ALIGN_TO_32 
SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - typedef int_fast32_t simde__m256i SIMDE_ALIGN_TO_32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - typedef simde_float64 simde__m256d SIMDE_ALIGN_TO_32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; -#else - typedef simde__m256_private simde__m256; - typedef simde__m256i_private simde__m256i; - typedef simde__m256d_private simde__m256d; -#endif - -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #if !defined(HEDLEY_INTEL_VERSION) && !defined(_AVXINTRIN_H_INCLUDED) && !defined(__AVXINTRIN_H) && !defined(_CMP_EQ_OQ) - typedef simde__m256 __m256; - typedef simde__m256i __m256i; - typedef simde__m256d __m256d; - #else - #undef __m256 - #define __m256 simde__m256 - #undef __m256i - #define __m256i simde__m256i - #undef __m256d - #define __m256d simde__m256d - #endif -#endif - -HEDLEY_STATIC_ASSERT(32 == sizeof(simde__m256), "simde__m256 size incorrect"); -HEDLEY_STATIC_ASSERT(32 == sizeof(simde__m256_private), "simde__m256_private size incorrect"); -HEDLEY_STATIC_ASSERT(32 == sizeof(simde__m256i), "simde__m256i size incorrect"); -HEDLEY_STATIC_ASSERT(32 == sizeof(simde__m256i_private), "simde__m256i_private size incorrect"); -HEDLEY_STATIC_ASSERT(32 == sizeof(simde__m256d), "simde__m256d size incorrect"); -HEDLEY_STATIC_ASSERT(32 == sizeof(simde__m256d_private), "simde__m256d_private size incorrect"); -#if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF) -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m256) == 32, "simde__m256 is not 32-byte aligned"); -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m256_private) == 32, "simde__m256_private is not 32-byte aligned"); -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m256i) == 32, "simde__m256i is not 32-byte aligned"); -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m256i_private) == 32, "simde__m256i_private is not 32-byte aligned"); -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m256d) == 32, "simde__m256d is not 32-byte aligned"); -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m256d_private) == 32, "simde__m256d_private is not 32-byte aligned"); -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde__m256_from_private(simde__m256_private v) { - simde__m256 r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256_private -simde__m256_to_private(simde__m256 v) { - simde__m256_private r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde__m256i_from_private(simde__m256i_private v) { - simde__m256i r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i_private -simde__m256i_to_private(simde__m256i v) { - simde__m256i_private r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde__m256d_from_private(simde__m256d_private v) { - simde__m256d r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d_private -simde__m256d_to_private(simde__m256d v) { - simde__m256d_private r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -#define SIMDE_CMP_EQ_OQ 0 -#define SIMDE_CMP_LT_OS 1 -#define SIMDE_CMP_LE_OS 2 -#define SIMDE_CMP_UNORD_Q 3 -#define SIMDE_CMP_NEQ_UQ 4 -#define SIMDE_CMP_NLT_US 5 -#define SIMDE_CMP_NLE_US 6 -#define SIMDE_CMP_ORD_Q 7 -#define SIMDE_CMP_EQ_UQ 8 -#define SIMDE_CMP_NGE_US 9 -#define SIMDE_CMP_NGT_US 10 -#define SIMDE_CMP_FALSE_OQ 11 -#define SIMDE_CMP_NEQ_OQ 12 -#define SIMDE_CMP_GE_OS 13 -#define SIMDE_CMP_GT_OS 14 -#define SIMDE_CMP_TRUE_UQ 15 -#define SIMDE_CMP_EQ_OS 16 -#define SIMDE_CMP_LT_OQ 17 -#define 
SIMDE_CMP_LE_OQ 18 -#define SIMDE_CMP_UNORD_S 19 -#define SIMDE_CMP_NEQ_US 20 -#define SIMDE_CMP_NLT_UQ 21 -#define SIMDE_CMP_NLE_UQ 22 -#define SIMDE_CMP_ORD_S 23 -#define SIMDE_CMP_EQ_US 24 -#define SIMDE_CMP_NGE_UQ 25 -#define SIMDE_CMP_NGT_UQ 26 -#define SIMDE_CMP_FALSE_OS 27 -#define SIMDE_CMP_NEQ_OS 28 -#define SIMDE_CMP_GE_OQ 29 -#define SIMDE_CMP_GT_OQ 30 -#define SIMDE_CMP_TRUE_US 31 - -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) && !defined(_CMP_EQ_OQ) -#define _CMP_EQ_OQ SIMDE_CMP_EQ_OQ -#define _CMP_LT_OS SIMDE_CMP_LT_OS -#define _CMP_LE_OS SIMDE_CMP_LE_OS -#define _CMP_UNORD_Q SIMDE_CMP_UNORD_Q -#define _CMP_NEQ_UQ SIMDE_CMP_NEQ_UQ -#define _CMP_NLT_US SIMDE_CMP_NLT_US -#define _CMP_NLE_US SIMDE_CMP_NLE_US -#define _CMP_ORD_Q SIMDE_CMP_ORD_Q -#define _CMP_EQ_UQ SIMDE_CMP_EQ_UQ -#define _CMP_NGE_US SIMDE_CMP_NGE_US -#define _CMP_NGT_US SIMDE_CMP_NGT_US -#define _CMP_FALSE_OQ SIMDE_CMP_FALSE_OQ -#define _CMP_NEQ_OQ SIMDE_CMP_NEQ_OQ -#define _CMP_GE_OS SIMDE_CMP_GE_OS -#define _CMP_GT_OS SIMDE_CMP_GT_OS -#define _CMP_TRUE_UQ SIMDE_CMP_TRUE_UQ -#define _CMP_EQ_OS SIMDE_CMP_EQ_OS -#define _CMP_LT_OQ SIMDE_CMP_LT_OQ -#define _CMP_LE_OQ SIMDE_CMP_LE_OQ -#define _CMP_UNORD_S SIMDE_CMP_UNORD_S -#define _CMP_NEQ_US SIMDE_CMP_NEQ_US -#define _CMP_NLT_UQ SIMDE_CMP_NLT_UQ -#define _CMP_NLE_UQ SIMDE_CMP_NLE_UQ -#define _CMP_ORD_S SIMDE_CMP_ORD_S -#define _CMP_EQ_US SIMDE_CMP_EQ_US -#define _CMP_NGE_UQ SIMDE_CMP_NGE_UQ -#define _CMP_NGT_UQ SIMDE_CMP_NGT_UQ -#define _CMP_FALSE_OS SIMDE_CMP_FALSE_OS -#define _CMP_NEQ_OS SIMDE_CMP_NEQ_OS -#define _CMP_GE_OQ SIMDE_CMP_GE_OQ -#define _CMP_GT_OQ SIMDE_CMP_GT_OQ -#define _CMP_TRUE_US SIMDE_CMP_TRUE_US -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_castps_pd (simde__m256 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_castps_pd(a); - #else - return *HEDLEY_REINTERPRET_CAST(simde__m256d*, &a); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_castps_pd - #define _mm256_castps_pd(a) simde_mm256_castps_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_castps_si256 (simde__m256 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_castps_si256(a); - #else - return *HEDLEY_REINTERPRET_CAST(simde__m256i*, &a); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_castps_si256 - #define _mm256_castps_si256(a) simde_mm256_castps_si256(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_castsi256_pd (simde__m256i a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_castsi256_pd(a); - #else - return *HEDLEY_REINTERPRET_CAST(simde__m256d*, &a); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_castsi256_pd - #define _mm256_castsi256_pd(a) simde_mm256_castsi256_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_castsi256_ps (simde__m256i a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_castsi256_ps(a); - #else - return *HEDLEY_REINTERPRET_CAST(simde__m256*, &a); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_castsi256_ps - #define _mm256_castsi256_ps(a) simde_mm256_castsi256_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_castpd_ps (simde__m256d a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_castpd_ps(a); - #else - return *HEDLEY_REINTERPRET_CAST(simde__m256*, &a); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_castpd_ps - #define _mm256_castpd_ps(a) simde_mm256_castpd_ps(a) -#endif - 
-SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_castpd_si256 (simde__m256d a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_castpd_si256(a); - #else - return *HEDLEY_REINTERPRET_CAST(simde__m256i*, &a); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_castpd_si256 - #define _mm256_castpd_si256(a) simde_mm256_castpd_si256(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_setzero_si256 (void) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_setzero_si256(); - #else - simde__m256i_private r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_setzero_si128(); - r_.m128i[1] = simde_mm_setzero_si128(); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = 0; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_setzero_si256 - #define _mm256_setzero_si256() simde_mm256_setzero_si256() -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_setzero_ps (void) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_setzero_ps(); - #else - return simde_mm256_castsi256_ps(simde_mm256_setzero_si256()); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_setzero_ps - #define _mm256_setzero_ps() simde_mm256_setzero_ps() -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_setzero_pd (void) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_setzero_pd(); - #else - return simde_mm256_castsi256_pd(simde_mm256_setzero_si256()); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_setzero_pd - #define _mm256_setzero_pd() simde_mm256_setzero_pd() -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_x_mm256_not_ps(simde__m256 a) { - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = ~a_.i32; - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) - r_.m128[0] = simde_x_mm_not_ps(a_.m128[0]); - r_.m128[1] = simde_x_mm_not_ps(a_.m128[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = ~(a_.i32[i]); - } - #endif - - return simde__m256_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_x_mm256_select_ps(simde__m256 a, simde__m256 b, simde__m256 mask) { - /* This function is for when you want to blend two elements together - * according to a mask. It is similar to _mm256_blendv_ps, except that - * it is undefined whether the blend is based on the highest bit in - * each lane (like blendv) or just bitwise operations. This allows - * us to implement the function efficiently everywhere. - * - * Basically, you promise that all the lanes in mask are either 0 or - * ~0. 
*/ - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_blendv_ps(a, b, mask); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b), - mask_ = simde__m256_to_private(mask); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = a_.i32 ^ ((a_.i32 ^ b_.i32) & mask_.i32); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) - r_.m128[0] = simde_x_mm_select_ps(a_.m128[0], b_.m128[0], mask_.m128[0]); - r_.m128[1] = simde_x_mm_select_ps(a_.m128[1], b_.m128[1], mask_.m128[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] ^ ((a_.i32[i] ^ b_.i32[i]) & mask_.i32[i]); - } - #endif - - return simde__m256_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_x_mm256_not_pd(simde__m256d a) { - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = ~a_.i64; - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) - r_.m128d[0] = simde_x_mm_not_pd(a_.m128d[0]); - r_.m128d[1] = simde_x_mm_not_pd(a_.m128d[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = ~(a_.i64[i]); - } - #endif - - return simde__m256d_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_x_mm256_select_pd(simde__m256d a, simde__m256d b, simde__m256d mask) { - /* This function is for when you want to blend two elements together - * according to a mask. It is similar to _mm256_blendv_pd, except that - * it is undefined whether the blend is based on the highest bit in - * each lane (like blendv) or just bitwise operations. This allows - * us to implement the function efficiently everywhere. - * - * Basically, you promise that all the lanes in mask are either 0 or - * ~0. 
*/ - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_blendv_pd(a, b, mask); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b), - mask_ = simde__m256d_to_private(mask); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = a_.i64 ^ ((a_.i64 ^ b_.i64) & mask_.i64); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) - r_.m128d[0] = simde_x_mm_select_pd(a_.m128d[0], b_.m128d[0], mask_.m128d[0]); - r_.m128d[1] = simde_x_mm_select_pd(a_.m128d[1], b_.m128d[1], mask_.m128d[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[i] ^ ((a_.i64[i] ^ b_.i64[i]) & mask_.i64[i]); - } - #endif - - return simde__m256d_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_setone_si256 (void) { - simde__m256i_private r_; - -#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - __typeof__(r_.i32f) rv = { 0, }; - r_.i32f = ~rv; -#elif defined(SIMDE_X86_AVX2_NATIVE) - __m256i t = _mm256_setzero_si256(); - r_.n = _mm256_cmpeq_epi32(t, t); -#else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = ~HEDLEY_STATIC_CAST(int_fast32_t, 0); - } -#endif - - return simde__m256i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_x_mm256_setone_ps (void) { - return simde_mm256_castsi256_ps(simde_x_mm256_setone_si256()); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_x_mm256_setone_pd (void) { - return simde_mm256_castsi256_pd(simde_x_mm256_setone_si256()); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_set_epi8 (int8_t e31, int8_t e30, int8_t e29, int8_t e28, - int8_t e27, int8_t e26, int8_t e25, int8_t e24, - int8_t e23, int8_t e22, int8_t e21, int8_t e20, - int8_t e19, int8_t e18, int8_t e17, int8_t e16, - int8_t e15, int8_t e14, int8_t e13, int8_t e12, - int8_t e11, int8_t e10, int8_t e9, int8_t e8, - int8_t e7, int8_t e6, int8_t e5, int8_t e4, - int8_t e3, int8_t e2, int8_t e1, int8_t e0) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_set_epi8(e31, e30, e29, e28, e27, e26, e25, e24, - e23, e22, e21, e20, e19, e18, e17, e16, - e15, e14, e13, e12, e11, e10, e9, e8, - e7, e6, e5, e4, e3, e2, e1, e0); - #else - simde__m256i_private r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_set_epi8( - e15, e14, e13, e12, e11, e10, e9, e8, - e7, e6, e5, e4, e3, e2, e1, e0); - r_.m128i[1] = simde_mm_set_epi8( - e31, e30, e29, e28, e27, e26, e25, e24, - e23, e22, e21, e20, e19, e18, e17, e16); - #else - r_.i8[ 0] = e0; - r_.i8[ 1] = e1; - r_.i8[ 2] = e2; - r_.i8[ 3] = e3; - r_.i8[ 4] = e4; - r_.i8[ 5] = e5; - r_.i8[ 6] = e6; - r_.i8[ 7] = e7; - r_.i8[ 8] = e8; - r_.i8[ 9] = e9; - r_.i8[10] = e10; - r_.i8[11] = e11; - r_.i8[12] = e12; - r_.i8[13] = e13; - r_.i8[14] = e14; - r_.i8[15] = e15; - r_.i8[16] = e16; - r_.i8[17] = e17; - r_.i8[18] = e18; - r_.i8[19] = e19; - r_.i8[20] = e20; - r_.i8[21] = e21; - r_.i8[22] = e22; - r_.i8[23] = e23; - r_.i8[24] = e24; - r_.i8[25] = e25; - r_.i8[26] = e26; - r_.i8[27] = e27; - r_.i8[28] = e28; - r_.i8[29] = e29; - r_.i8[30] = e30; - r_.i8[31] = e31; - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_set_epi8 - #define _mm256_set_epi8(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) \ - simde_mm256_set_epi8(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, 
e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_set_epi16 (int16_t e15, int16_t e14, int16_t e13, int16_t e12, - int16_t e11, int16_t e10, int16_t e9, int16_t e8, - int16_t e7, int16_t e6, int16_t e5, int16_t e4, - int16_t e3, int16_t e2, int16_t e1, int16_t e0) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_set_epi16(e15, e14, e13, e12, e11, e10, e9, e8, - e7, e6, e5, e4, e3, e2, e1, e0); - #else - simde__m256i_private r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_set_epi16( e7, e6, e5, e4, e3, e2, e1, e0); - r_.m128i[1] = simde_mm_set_epi16(e15, e14, e13, e12, e11, e10, e9, e8); - #else - r_.i16[ 0] = e0; - r_.i16[ 1] = e1; - r_.i16[ 2] = e2; - r_.i16[ 3] = e3; - r_.i16[ 4] = e4; - r_.i16[ 5] = e5; - r_.i16[ 6] = e6; - r_.i16[ 7] = e7; - r_.i16[ 8] = e8; - r_.i16[ 9] = e9; - r_.i16[10] = e10; - r_.i16[11] = e11; - r_.i16[12] = e12; - r_.i16[13] = e13; - r_.i16[14] = e14; - r_.i16[15] = e15; - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_set_epi16 - #define _mm256_set_epi16(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) \ - simde_mm256_set_epi16(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_set_epi32 (int32_t e7, int32_t e6, int32_t e5, int32_t e4, - int32_t e3, int32_t e2, int32_t e1, int32_t e0) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0); - #else - simde__m256i_private r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_set_epi32(e3, e2, e1, e0); - r_.m128i[1] = simde_mm_set_epi32(e7, e6, e5, e4); - #else - r_.i32[ 0] = e0; - r_.i32[ 1] = e1; - r_.i32[ 2] = e2; - r_.i32[ 3] = e3; - r_.i32[ 4] = e4; - r_.i32[ 5] = e5; - r_.i32[ 6] = e6; - r_.i32[ 7] = e7; - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_set_epi32 - #define _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0) \ - simde_mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_set_epi64x (int64_t e3, int64_t e2, int64_t e1, int64_t e0) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_set_epi64x(e3, e2, e1, e0); - #else - simde__m256i_private r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_set_epi64x(e1, e0); - r_.m128i[1] = simde_mm_set_epi64x(e3, e2); - #else - r_.i64[0] = e0; - r_.i64[1] = e1; - r_.i64[2] = e2; - r_.i64[3] = e3; - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_set_epi64x - #define _mm256_set_epi64x(e3, e2, e1, e0) simde_mm256_set_epi64x(e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_set_epu8 (uint8_t e31, uint8_t e30, uint8_t e29, uint8_t e28, - uint8_t e27, uint8_t e26, uint8_t e25, uint8_t e24, - uint8_t e23, uint8_t e22, uint8_t e21, uint8_t e20, - uint8_t e19, uint8_t e18, uint8_t e17, uint8_t e16, - uint8_t e15, uint8_t e14, uint8_t e13, uint8_t e12, - uint8_t e11, uint8_t e10, uint8_t e9, uint8_t e8, - uint8_t e7, uint8_t e6, uint8_t e5, uint8_t e4, - uint8_t e3, uint8_t e2, uint8_t e1, uint8_t e0) { - simde__m256i_private r_; - - r_.u8[ 0] = e0; - r_.u8[ 1] = e1; - r_.u8[ 2] = e2; - r_.u8[ 3] = e3; - r_.u8[ 4] = e4; - r_.u8[ 5] = e5; 
- r_.u8[ 6] = e6; - r_.u8[ 7] = e7; - r_.u8[ 8] = e8; - r_.u8[ 9] = e9; - r_.u8[10] = e10; - r_.u8[11] = e11; - r_.u8[12] = e12; - r_.u8[13] = e13; - r_.u8[14] = e14; - r_.u8[15] = e15; - r_.u8[16] = e16; - r_.u8[17] = e17; - r_.u8[18] = e18; - r_.u8[19] = e19; - r_.u8[20] = e20; - r_.u8[20] = e20; - r_.u8[21] = e21; - r_.u8[22] = e22; - r_.u8[23] = e23; - r_.u8[24] = e24; - r_.u8[25] = e25; - r_.u8[26] = e26; - r_.u8[27] = e27; - r_.u8[28] = e28; - r_.u8[29] = e29; - r_.u8[30] = e30; - r_.u8[31] = e31; - - return simde__m256i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_set_epu16 (uint16_t e15, uint16_t e14, uint16_t e13, uint16_t e12, - uint16_t e11, uint16_t e10, uint16_t e9, uint16_t e8, - uint16_t e7, uint16_t e6, uint16_t e5, uint16_t e4, - uint16_t e3, uint16_t e2, uint16_t e1, uint16_t e0) { - simde__m256i_private r_; - - r_.u16[ 0] = e0; - r_.u16[ 1] = e1; - r_.u16[ 2] = e2; - r_.u16[ 3] = e3; - r_.u16[ 4] = e4; - r_.u16[ 5] = e5; - r_.u16[ 6] = e6; - r_.u16[ 7] = e7; - r_.u16[ 8] = e8; - r_.u16[ 9] = e9; - r_.u16[10] = e10; - r_.u16[11] = e11; - r_.u16[12] = e12; - r_.u16[13] = e13; - r_.u16[14] = e14; - r_.u16[15] = e15; - - return simde__m256i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_set_epu32 (uint32_t e7, uint32_t e6, uint32_t e5, uint32_t e4, - uint32_t e3, uint32_t e2, uint32_t e1, uint32_t e0) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_set_epi32(HEDLEY_STATIC_CAST(int32_t, e7), HEDLEY_STATIC_CAST(int32_t, e6), HEDLEY_STATIC_CAST(int32_t, e5), HEDLEY_STATIC_CAST(int32_t, e4), - HEDLEY_STATIC_CAST(int32_t, e3), HEDLEY_STATIC_CAST(int32_t, e2), HEDLEY_STATIC_CAST(int32_t, e1), HEDLEY_STATIC_CAST(int32_t, e0)); - #else - simde__m256i_private r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_set_epi32(HEDLEY_STATIC_CAST(int32_t, e3), HEDLEY_STATIC_CAST(int32_t, e2), HEDLEY_STATIC_CAST(int32_t, e1), HEDLEY_STATIC_CAST(int32_t, e0)); - r_.m128i[1] = simde_mm_set_epi32(HEDLEY_STATIC_CAST(int32_t, e7), HEDLEY_STATIC_CAST(int32_t, e6), HEDLEY_STATIC_CAST(int32_t, e5), HEDLEY_STATIC_CAST(int32_t, e4)); - #else - r_.u32[ 0] = e0; - r_.u32[ 1] = e1; - r_.u32[ 2] = e2; - r_.u32[ 3] = e3; - r_.u32[ 4] = e4; - r_.u32[ 5] = e5; - r_.u32[ 6] = e6; - r_.u32[ 7] = e7; - #endif - - return simde__m256i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_set_epu64x (uint64_t e3, uint64_t e2, uint64_t e1, uint64_t e0) { - simde__m256i_private r_; - - r_.u64[0] = e0; - r_.u64[1] = e1; - r_.u64[2] = e2; - r_.u64[3] = e3; - - return simde__m256i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_set_ps (simde_float32 e7, simde_float32 e6, simde_float32 e5, simde_float32 e4, - simde_float32 e3, simde_float32 e2, simde_float32 e1, simde_float32 e0) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0); - #else - simde__m256_private r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128[0] = simde_mm_set_ps(e3, e2, e1, e0); - r_.m128[1] = simde_mm_set_ps(e7, e6, e5, e4); - #else - r_.f32[0] = e0; - r_.f32[1] = e1; - r_.f32[2] = e2; - r_.f32[3] = e3; - r_.f32[4] = e4; - r_.f32[5] = e5; - r_.f32[6] = e6; - r_.f32[7] = e7; - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_set_ps - #define _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0) \ - simde_mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES 
-simde__m256d -simde_mm256_set_pd (simde_float64 e3, simde_float64 e2, simde_float64 e1, simde_float64 e0) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_set_pd(e3, e2, e1, e0); - #else - simde__m256d_private r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128d[0] = simde_mm_set_pd(e1, e0); - r_.m128d[1] = simde_mm_set_pd(e3, e2); - #else - r_.f64[0] = e0; - r_.f64[1] = e1; - r_.f64[2] = e2; - r_.f64[3] = e3; - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_set_pd - #define _mm256_set_pd(e3, e2, e1, e0) \ - simde_mm256_set_pd(e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_set_m128 (simde__m128 e1, simde__m128 e0) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_insertf128_ps(_mm256_castps128_ps256(e0), e1, 1); - #else - simde__m256_private r_; - simde__m128_private - e1_ = simde__m128_to_private(e1), - e0_ = simde__m128_to_private(e0); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128_private[0] = e0_; - r_.m128_private[1] = e1_; - #elif defined(SIMDE_HAVE_INT128_) - r_.i128[0] = e0_.i128[0]; - r_.i128[1] = e1_.i128[0]; - #else - r_.i64[0] = e0_.i64[0]; - r_.i64[1] = e0_.i64[1]; - r_.i64[2] = e1_.i64[0]; - r_.i64[3] = e1_.i64[1]; - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_set_m128 - #define _mm256_set_m128(e1, e0) simde_mm256_set_m128(e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_set_m128d (simde__m128d e1, simde__m128d e0) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_insertf128_pd(_mm256_castpd128_pd256(e0), e1, 1); - #else - simde__m256d_private r_; - simde__m128d_private - e1_ = simde__m128d_to_private(e1), - e0_ = simde__m128d_to_private(e0); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128d_private[0] = e0_; - r_.m128d_private[1] = e1_; - #else - r_.i64[0] = e0_.i64[0]; - r_.i64[1] = e0_.i64[1]; - r_.i64[2] = e1_.i64[0]; - r_.i64[3] = e1_.i64[1]; - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_set_m128d - #define _mm256_set_m128d(e1, e0) simde_mm256_set_m128d(e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_set_m128i (simde__m128i e1, simde__m128i e0) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_insertf128_si256(_mm256_castsi128_si256(e0), e1, 1); - #else - simde__m256i_private r_; - simde__m128i_private - e1_ = simde__m128i_to_private(e1), - e0_ = simde__m128i_to_private(e0); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i_private[0] = e0_; - r_.m128i_private[1] = e1_; - #else - r_.i64[0] = e0_.i64[0]; - r_.i64[1] = e0_.i64[1]; - r_.i64[2] = e1_.i64[0]; - r_.i64[3] = e1_.i64[1]; - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_set_m128i - #define _mm256_set_m128i(e1, e0) simde_mm256_set_m128i(e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_set1_epi8 (int8_t a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_set1_epi8(a); - #else - simde__m256i_private r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_set1_epi8(a); - r_.m128i[1] = simde_mm_set1_epi8(a); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef 
_mm256_set1_epi8 - #define _mm256_set1_epi8(a) simde_mm256_set1_epi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_set1_epi16 (int16_t a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_set1_epi16(a); - #else - simde__m256i_private r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_set1_epi16(a); - r_.m128i[1] = simde_mm_set1_epi16(a); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_set1_epi16 - #define _mm256_set1_epi16(a) simde_mm256_set1_epi16(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_set1_epi32 (int32_t a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_set1_epi32(a); - #else - simde__m256i_private r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_set1_epi32(a); - r_.m128i[1] = simde_mm_set1_epi32(a); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_set1_epi32 - #define _mm256_set1_epi32(a) simde_mm256_set1_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_set1_epi64x (int64_t a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_set1_epi64x(a); - #else - simde__m256i_private r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_set1_epi64x(a); - r_.m128i[1] = simde_mm_set1_epi64x(a); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_set1_epi64x - #define _mm256_set1_epi64x(a) simde_mm256_set1_epi64x(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_set1_ps (simde_float32 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_set1_ps(a); - #else - simde__m256_private r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128[0] = simde_mm_set1_ps(a); - r_.m128[1] = simde_mm_set1_ps(a); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a; - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_set1_ps - #define _mm256_set1_ps(a) simde_mm256_set1_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_set1_pd (simde_float64 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_set1_pd(a); - #else - simde__m256d_private r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128d[0] = simde_mm_set1_pd(a); - r_.m128d[1] = simde_mm_set1_pd(a); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a; - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_set1_pd - #define _mm256_set1_pd(a) simde_mm256_set1_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_deinterleaveeven_epi16 (simde__m256i a, simde__m256i b) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_x_mm_deinterleaveeven_epi16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = 
[extern/simde removal, continued — deleted vendored AVX portability implementations:
 simde_x_mm256_deinterleave{even,odd}_{epi16,epi32,ps,pd} and simde_x_mm256_abs_{ps,pd} helpers;
 the simde_mm256_{add,hadd,addsub,and,andnot,blend,blendv,broadcast,cast,round,ceil,floor,div}_{ps,pd} families and their si128/si256 cast variants;
 the simde_mm_cmp_{pd,ps,sd,ss} and simde_mm256_cmp_{pd,ps} comparison dispatchers (including the clang/AVX-512 workaround macros);
 simde_x_mm256_copysign_{ps,pd}; the simde_mm256_cvt*/cvtt* conversions and cvtsd_f64/cvtsi256_si32/cvtss_f32 extractors;
 simde_mm256_extractf128_{pd,ps,si256}; simde_mm256_insert_epi{8,16,32,64} and insertf128_{pd,ps};
 each with its SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES #undef/#define wrapper.
 The full text of the removed third-party header is omitted.]
a_.m128_private[imm8] = b_; - - return simde__m256_from_private(a_); -} -#if defined(SIMDE_X86_AVX_NATIVE) - #define simde_mm256_insertf128_ps(a, b, imm8) _mm256_insertf128_ps(a, b, imm8) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_insertf128_ps - #define _mm256_insertf128_ps(a, b, imm8) simde_mm256_insertf128_ps(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i simde_mm256_insertf128_si256(simde__m256i a, simde__m128i b, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) { - simde__m256i_private a_ = simde__m256i_to_private(a); - simde__m128i_private b_ = simde__m128i_to_private(b); - - a_.m128i_private[imm8] = b_; - - return simde__m256i_from_private(a_); -} -#if defined(SIMDE_X86_AVX_NATIVE) - #define simde_mm256_insertf128_si256(a, b, imm8) _mm256_insertf128_si256(a, b, imm8) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_insertf128_si256 - #define _mm256_insertf128_si256(a, b, imm8) simde_mm256_insertf128_si256(a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX_NATIVE) -# define simde_mm256_dp_ps(a, b, imm8) _mm256_dp_ps(a, b, imm8) -#else -# define simde_mm256_dp_ps(a, b, imm8) \ - simde_mm256_set_m128( \ - simde_mm_dp_ps(simde_mm256_extractf128_ps(a, 1), simde_mm256_extractf128_ps(b, 1), imm8), \ - simde_mm_dp_ps(simde_mm256_extractf128_ps(a, 0), simde_mm256_extractf128_ps(b, 0), imm8)) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_dp_ps - #define _mm256_dp_ps(a, b, imm8) simde_mm256_dp_ps(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int32_t -simde_mm256_extract_epi32 (simde__m256i a, const int index) - SIMDE_REQUIRE_RANGE(index, 0, 7) { - simde__m256i_private a_ = simde__m256i_to_private(a); - return a_.i32[index]; -} -#if defined(SIMDE_X86_AVX_NATIVE) && \ - (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,10,0)) - #define simde_mm256_extract_epi32(a, index) _mm256_extract_epi32(a, index) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_extract_epi32 - #define _mm256_extract_epi32(a, index) simde_mm256_extract_epi32(a, index) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int64_t -simde_mm256_extract_epi64 (simde__m256i a, const int index) - SIMDE_REQUIRE_RANGE(index, 0, 3) { - simde__m256i_private a_ = simde__m256i_to_private(a); - return a_.i64[index]; -} -#if defined(SIMDE_X86_AVX_NATIVE) && defined(SIMDE_ARCH_AMD64) - #if !defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0) - #define simde_mm256_extract_epi64(a, index) _mm256_extract_epi64(a, index) - #endif -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) - #undef _mm256_extract_epi64 - #define _mm256_extract_epi64(a, index) simde_mm256_extract_epi64(a, index) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_lddqu_si256 (simde__m256i const * mem_addr) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_loadu_si256(mem_addr); - #else - simde__m256i r; - simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), sizeof(r)); - return r; - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_lddqu_si256 - #define _mm256_lddqu_si256(a) simde_mm256_lddqu_si256(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_load_pd (const double mem_addr[HEDLEY_ARRAY_PARAM(4)]) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_load_pd(mem_addr); - #else - simde__m256d r; - simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256d), sizeof(r)); - 
return r; - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_load_pd - #define _mm256_load_pd(a) simde_mm256_load_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_load_ps (const float mem_addr[HEDLEY_ARRAY_PARAM(8)]) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_load_ps(mem_addr); - #else - simde__m256 r; - simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256), sizeof(r)); - return r; - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_load_ps - #define _mm256_load_ps(a) simde_mm256_load_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_load_si256 (simde__m256i const * mem_addr) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_load_si256(mem_addr); - #else - simde__m256i r; - simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), sizeof(r)); - return r; - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_load_si256 - #define _mm256_load_si256(a) simde_mm256_load_si256(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_loadu_pd (const double a[HEDLEY_ARRAY_PARAM(4)]) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_loadu_pd(a); - #else - simde__m256d r; - simde_memcpy(&r, a, sizeof(r)); - return r; - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_loadu_pd - #define _mm256_loadu_pd(a) simde_mm256_loadu_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_loadu_ps (const float a[HEDLEY_ARRAY_PARAM(8)]) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_loadu_ps(a); - #else - simde__m256 r; - simde_memcpy(&r, a, sizeof(r)); - return r; - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_loadu_ps - #define _mm256_loadu_ps(a) simde_mm256_loadu_ps(a) -#endif - -#if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) \ - && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862) \ - && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) - #define simde_mm256_loadu_epi8(mem_addr) _mm256_loadu_epi8(mem_addr) -#else -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_loadu_epi8(void const * mem_addr) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_loadu_si256(SIMDE_ALIGN_CAST(__m256i const *, mem_addr)); - #else - simde__m256i r; - simde_memcpy(&r, mem_addr, sizeof(r)); - return r; - #endif -} -#endif -#define simde_x_mm256_loadu_epi8(mem_addr) simde_mm256_loadu_epi8(mem_addr) -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862))) - #undef _mm256_loadu_epi8 - #define _mm256_loadu_epi8(a) simde_mm256_loadu_epi8(a) -#endif - -#if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) \ - && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862) \ - && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) - #define simde_mm256_loadu_epi16(mem_addr) _mm256_loadu_epi16(mem_addr) -#else -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_loadu_epi16(void const * mem_addr) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_loadu_si256(SIMDE_ALIGN_CAST(__m256i const *, mem_addr)); - #else - simde__m256i r; - simde_memcpy(&r, mem_addr, sizeof(r)); - return r; - #endif -} -#endif -#define simde_x_mm256_loadu_epi16(mem_addr) simde_mm256_loadu_epi16(mem_addr) -#if 
defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862))) - #undef _mm256_loadu_epi16 - #define _mm256_loadu_epi16(a) simde_mm256_loadu_epi16(a) -#endif - -#if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) \ - && !defined(SIMDE_BUG_CLANG_REV_344862) \ - && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) - #define simde_mm256_loadu_epi32(mem_addr) _mm256_loadu_epi32(mem_addr) -#else -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_loadu_epi32(void const * mem_addr) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_loadu_si256(SIMDE_ALIGN_CAST(__m256i const *, mem_addr)); - #else - simde__m256i r; - simde_memcpy(&r, mem_addr, sizeof(r)); - return r; - #endif -} -#endif -#define simde_x_mm256_loadu_epi32(mem_addr) simde_mm256_loadu_epi32(mem_addr) -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862))) - #undef _mm256_loadu_epi32 - #define _mm256_loadu_epi32(a) simde_mm256_loadu_epi32(a) -#endif - -#if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) \ - && !defined(SIMDE_BUG_CLANG_REV_344862) \ - && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) - #define simde_mm256_loadu_epi64(mem_addr) _mm256_loadu_epi64(mem_addr) -#else -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_loadu_epi64(void const * mem_addr) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_loadu_si256(SIMDE_ALIGN_CAST(__m256i const *, mem_addr)); - #else - simde__m256i r; - simde_memcpy(&r, mem_addr, sizeof(r)); - return r; - #endif -} -#endif -#define simde_x_mm256_loadu_epi64(mem_addr) simde_mm256_loadu_epi64(mem_addr) -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862))) - #undef _mm256_loadu_epi64 - #define _mm256_loadu_epi64(a) simde_mm256_loadu_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_loadu_si256 (void const * mem_addr) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_loadu_si256(SIMDE_ALIGN_CAST(const __m256i*, mem_addr)); - #else - simde__m256i r; - simde_memcpy(&r, mem_addr, sizeof(r)); - return r; - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_loadu_si256 - #define _mm256_loadu_si256(mem_addr) simde_mm256_loadu_si256(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_loadu2_m128 (const float hiaddr[HEDLEY_ARRAY_PARAM(4)], const float loaddr[HEDLEY_ARRAY_PARAM(4)]) { - #if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341) && !defined(SIMDE_BUG_MCST_LCC_MISSING_AVX_LOAD_STORE_M128_FUNCS) - return _mm256_loadu2_m128(hiaddr, loaddr); - #else - return - simde_mm256_insertf128_ps(simde_mm256_castps128_ps256(simde_mm_loadu_ps(loaddr)), - simde_mm_loadu_ps(hiaddr), 1); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_loadu2_m128 - #define _mm256_loadu2_m128(hiaddr, loaddr) simde_mm256_loadu2_m128(hiaddr, loaddr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_loadu2_m128d (const double hiaddr[HEDLEY_ARRAY_PARAM(2)], const double loaddr[HEDLEY_ARRAY_PARAM(2)]) { - #if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341) && !defined(SIMDE_BUG_MCST_LCC_MISSING_AVX_LOAD_STORE_M128_FUNCS) 
- return _mm256_loadu2_m128d(hiaddr, loaddr); - #else - return - simde_mm256_insertf128_pd(simde_mm256_castpd128_pd256(simde_mm_loadu_pd(loaddr)), - simde_mm_loadu_pd(hiaddr), 1); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_loadu2_m128d - #define _mm256_loadu2_m128d(hiaddr, loaddr) simde_mm256_loadu2_m128d(hiaddr, loaddr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_loadu2_m128i (const simde__m128i* hiaddr, const simde__m128i* loaddr) { - #if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341) && !defined(SIMDE_BUG_MCST_LCC_MISSING_AVX_LOAD_STORE_M128_FUNCS) - return _mm256_loadu2_m128i(hiaddr, loaddr); - #else - return - simde_mm256_insertf128_si256(simde_mm256_castsi128_si256(simde_mm_loadu_si128(loaddr)), - simde_mm_loadu_si128(hiaddr), 1); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_loadu2_m128i - #define _mm256_loadu2_m128i(hiaddr, loaddr) simde_mm256_loadu2_m128i(hiaddr, loaddr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_maskload_pd (const simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128i mask) { - #if defined(SIMDE_X86_AVX_NATIVE) - #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - return _mm_maskload_pd(mem_addr, HEDLEY_REINTERPRET_CAST(simde__m128d, mask)); - #else - return _mm_maskload_pd(mem_addr, mask); - #endif - #else - simde__m128d_private r_; - simde__m128i_private - mask_ = simde__m128i_to_private(mask), - mask_shr_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - mask_shr_.neon_i64 = vshrq_n_s64(mask_.neon_i64, 63); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return simde_mm_and_pd(simde_mm_load_pd(mem_addr), - simde__m128d_from_wasm_v128(wasm_i64x2_shr(mask_.wasm_v128, 63))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(mask_.i64) / sizeof(mask_.i64[0])) ; i++) { - mask_shr_.i64[i] = mask_.i64[i] >> 63; - } - #endif - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = mask_shr_.i64[i] ? mem_addr[i] : SIMDE_FLOAT64_C(0.0); - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm_maskload_pd - #define _mm_maskload_pd(mem_addr, mask) simde_mm_maskload_pd(HEDLEY_REINTERPRET_CAST(double const*, mem_addr), mask) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_maskload_pd (const simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m256i mask) { - #if defined(SIMDE_X86_AVX_NATIVE) - #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - return _mm256_maskload_pd(mem_addr, HEDLEY_REINTERPRET_CAST(simde__m256d, mask)); - #else - return _mm256_maskload_pd(mem_addr, mask); - #endif - #else - simde__m256d_private r_; - simde__m256i_private mask_ = simde__m256i_to_private(mask); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = (mask_.i64[i] >> 63) ? 
mem_addr[i] : SIMDE_FLOAT64_C(0.0); - } - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskload_pd - #define _mm256_maskload_pd(mem_addr, mask) simde_mm256_maskload_pd(HEDLEY_REINTERPRET_CAST(double const*, mem_addr), mask) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_maskload_ps (const simde_float32 mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m128i mask) { - #if defined(SIMDE_X86_AVX_NATIVE) - #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - return _mm_maskload_ps(mem_addr, HEDLEY_REINTERPRET_CAST(simde__m128, mask)); - #else - return _mm_maskload_ps(mem_addr, mask); - #endif - #else - simde__m128_private r_; - simde__m128i_private - mask_ = simde__m128i_to_private(mask), - mask_shr_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - mask_shr_.neon_i32 = vshrq_n_s32(mask_.neon_i32, 31); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return simde_mm_and_ps(simde_mm_load_ps(mem_addr), - simde__m128_from_wasm_v128(wasm_i32x4_shr(mask_.wasm_v128, 31))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(mask_.i32) / sizeof(mask_.i32[0])) ; i++) { - mask_shr_.i32[i] = mask_.i32[i] >> 31; - } - #endif - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = mask_shr_.i32[i] ? mem_addr[i] : SIMDE_FLOAT32_C(0.0); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm_maskload_ps - #define _mm_maskload_ps(mem_addr, mask) simde_mm_maskload_ps(HEDLEY_REINTERPRET_CAST(float const*, mem_addr), mask) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_maskload_ps (const simde_float32 mem_addr[HEDLEY_ARRAY_PARAM(8)], simde__m256i mask) { - #if defined(SIMDE_X86_AVX_NATIVE) - #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - return _mm256_maskload_ps(mem_addr, HEDLEY_REINTERPRET_CAST(simde__m256, mask)); - #else - return _mm256_maskload_ps(mem_addr, mask); - #endif - #else - simde__m256_private r_; - simde__m256i_private mask_ = simde__m256i_to_private(mask); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = (mask_.i32[i] >> 31) ? 
mem_addr[i] : SIMDE_FLOAT32_C(0.0); - } - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskload_ps - #define _mm256_maskload_ps(mem_addr, mask) simde_mm256_maskload_ps(HEDLEY_REINTERPRET_CAST(float const*, mem_addr), mask) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_maskstore_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128i mask, simde__m128d a) { - #if defined(SIMDE_X86_AVX_NATIVE) - #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - _mm_maskstore_pd(mem_addr, HEDLEY_REINTERPRET_CAST(simde__m128d, mask), a); - #else - _mm_maskstore_pd(mem_addr, mask, a); - #endif - #else - simde__m128i_private mask_ = simde__m128i_to_private(mask); - simde__m128d_private a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_WASM_SIMD128_NATIVE) - if ((HEDLEY_STATIC_CAST(unsigned long long, wasm_i64x2_extract_lane(mask_.wasm_v128, 0)) & 0x8000000000000000ull) != 0) - mem_addr[0] = wasm_f64x2_extract_lane(a_.wasm_v128, 0); - if ((HEDLEY_STATIC_CAST(unsigned long long, wasm_i64x2_extract_lane(mask_.wasm_v128, 1)) & 0x8000000000000000ull) != 0) - mem_addr[1] = wasm_f64x2_extract_lane(a_.wasm_v128, 1); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { - if (mask_.u64[i] >> 63) - mem_addr[i] = a_.f64[i]; - } - #endif - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm_maskstore_pd - #define _mm_maskstore_pd(mem_addr, mask, a) simde_mm_maskstore_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), mask, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_maskstore_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m256i mask, simde__m256d a) { - #if defined(SIMDE_X86_AVX_NATIVE) - #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - _mm256_maskstore_pd(mem_addr, HEDLEY_REINTERPRET_CAST(simde__m256d, mask), a); - #else - _mm256_maskstore_pd(mem_addr, mask, a); - #endif - #else - simde__m256i_private mask_ = simde__m256i_to_private(mask); - simde__m256d_private a_ = simde__m256d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { - if (mask_.u64[i] & (UINT64_C(1) << 63)) - mem_addr[i] = a_.f64[i]; - } - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskstore_pd - #define _mm256_maskstore_pd(mem_addr, mask, a) simde_mm256_maskstore_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), mask, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_maskstore_ps (simde_float32 mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m128i mask, simde__m128 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - _mm_maskstore_ps(mem_addr, HEDLEY_REINTERPRET_CAST(simde__m128, mask), a); - #else - _mm_maskstore_ps(mem_addr, mask, a); - #endif - #else - simde__m128i_private mask_ = simde__m128i_to_private(mask); - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_WASM_SIMD128_NATIVE) - if ((HEDLEY_STATIC_CAST(unsigned long long, wasm_i32x4_extract_lane(mask_.wasm_v128, 0)) & 0x80000000ull) != 0) - mem_addr[0] = wasm_f32x4_extract_lane(a_.wasm_v128, 0); - if ((HEDLEY_STATIC_CAST(unsigned long long, wasm_i32x4_extract_lane(mask_.wasm_v128, 1)) & 0x80000000ull) != 0) - mem_addr[1] = wasm_f32x4_extract_lane(a_.wasm_v128, 1); - if ((HEDLEY_STATIC_CAST(unsigned long long, wasm_i32x4_extract_lane(mask_.wasm_v128, 2)) & 0x80000000ull) != 0) - mem_addr[2] = 
wasm_f32x4_extract_lane(a_.wasm_v128, 2); - if ((HEDLEY_STATIC_CAST(unsigned long long, wasm_i32x4_extract_lane(mask_.wasm_v128, 3)) & 0x80000000ull) != 0) - mem_addr[3] = wasm_f32x4_extract_lane(a_.wasm_v128, 3); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - if (mask_.u32[i] & (UINT32_C(1) << 31)) - mem_addr[i] = a_.f32[i]; - } - #endif - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm_maskstore_ps - #define _mm_maskstore_ps(mem_addr, mask, a) simde_mm_maskstore_ps(HEDLEY_REINTERPRET_CAST(float*, mem_addr), mask, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_maskstore_ps (simde_float32 mem_addr[HEDLEY_ARRAY_PARAM(8)], simde__m256i mask, simde__m256 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - _mm256_maskstore_ps(mem_addr, HEDLEY_REINTERPRET_CAST(simde__m256, mask), a); - #else - _mm256_maskstore_ps(mem_addr, mask, a); - #endif - #else - simde__m256i_private mask_ = simde__m256i_to_private(mask); - simde__m256_private a_ = simde__m256_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - if (mask_.u32[i] & (UINT32_C(1) << 31)) - mem_addr[i] = a_.f32[i]; - } - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskstore_ps - #define _mm256_maskstore_ps(mem_addr, mask, a) simde_mm256_maskstore_ps(HEDLEY_REINTERPRET_CAST(float*, mem_addr), mask, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_min_ps (simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_min_ps(a, b); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128[0] = simde_mm_min_ps(a_.m128[0], b_.m128[0]); - r_.m128[1] = simde_mm_min_ps(a_.m128[1], b_.m128[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = (a_.f32[i] < b_.f32[i]) ? a_.f32[i] : b_.f32[i]; - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_min_ps - #define _mm256_min_ps(a, b) simde_mm256_min_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_min_pd (simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_min_pd(a, b); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128d[0] = simde_mm_min_pd(a_.m128d[0], b_.m128d[0]); - r_.m128d[1] = simde_mm_min_pd(a_.m128d[1], b_.m128d[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = (a_.f64[i] < b_.f64[i]) ? 
a_.f64[i] : b_.f64[i]; - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_min_pd - #define _mm256_min_pd(a, b) simde_mm256_min_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_max_ps (simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_max_ps(a, b); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128[0] = simde_mm_max_ps(a_.m128[0], b_.m128[0]); - r_.m128[1] = simde_mm_max_ps(a_.m128[1], b_.m128[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = (a_.f32[i] > b_.f32[i]) ? a_.f32[i] : b_.f32[i]; - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_max_ps - #define _mm256_max_ps(a, b) simde_mm256_max_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_max_pd (simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_max_pd(a, b); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128d[0] = simde_mm_max_pd(a_.m128d[0], b_.m128d[0]); - r_.m128d[1] = simde_mm_max_pd(a_.m128d[1], b_.m128d[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = (a_.f64[i] > b_.f64[i]) ? a_.f64[i] : b_.f64[i]; - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_max_pd - #define _mm256_max_pd(a, b) simde_mm256_max_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_movedup_pd (simde__m256d a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_movedup_pd(a); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if defined(SIMDE_SHUFFLE_VECTOR_) - r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 32, a_.f64, a_.f64, 0, 0, 2, 2); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i += 2) { - r_.f64[i] = r_.f64[i + 1] = a_.f64[i]; - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_movedup_pd - #define _mm256_movedup_pd(a) simde_mm256_movedup_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_movehdup_ps (simde__m256 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_movehdup_ps(a); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.f32, a_.f32, 1, 1, 3, 3, 5, 5, 7, 7); - #else - SIMDE_VECTORIZE - for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i += 2) { - r_.f32[i - 1] = r_.f32[i] = a_.f32[i]; - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_movehdup_ps - #define _mm256_movehdup_ps(a) simde_mm256_movehdup_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_moveldup_ps (simde__m256 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_moveldup_ps(a); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.f32, a_.f32, 0, 0, 2, 2, 4, 4, 6, 6); - #else - SIMDE_VECTORIZE 
- for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i += 2) { - r_.f32[i] = r_.f32[i + 1] = a_.f32[i]; - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_moveldup_ps - #define _mm256_moveldup_ps(a) simde_mm256_moveldup_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm256_movemask_ps (simde__m256 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_movemask_ps(a); - #else - simde__m256_private a_ = simde__m256_to_private(a); - int r = 0; - - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - r |= (a_.u32[i] >> 31) << i; - } - - return r; - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_movemask_ps - #define _mm256_movemask_ps(a) simde_mm256_movemask_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm256_movemask_pd (simde__m256d a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_movemask_pd(a); - #else - simde__m256d_private a_ = simde__m256d_to_private(a); - int r = 0; - - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { - r |= (a_.u64[i] >> 63) << i; - } - - return r; - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_movemask_pd - #define _mm256_movemask_pd(a) simde_mm256_movemask_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_mul_ps (simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_mul_ps(a, b); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128[0] = simde_mm_mul_ps(a_.m128[0], b_.m128[0]); - r_.m128[1] = simde_mm_mul_ps(a_.m128[1], b_.m128[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f32 = a_.f32 * b_.f32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[i] * b_.f32[i]; - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_mul_ps - #define _mm256_mul_ps(a, b) simde_mm256_mul_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_mul_pd (simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_mul_pd(a, b); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128d[0] = simde_mm_mul_pd(a_.m128d[0], b_.m128d[0]); - r_.m128d[1] = simde_mm_mul_pd(a_.m128d[1], b_.m128d[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f64 = a_.f64 * b_.f64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f64[i] * b_.f64[i]; - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_mul_pd - #define _mm256_mul_pd(a, b) simde_mm256_mul_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_or_ps (simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_or_ps(a, b); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128[0] = simde_mm_or_ps(a_.m128[0], b_.m128[0]); - r_.m128[1] = simde_mm_or_ps(a_.m128[1], b_.m128[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f | b_.i32f; 
- #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] | b_.u32[i]; - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_or_ps - #define _mm256_or_ps(a, b) simde_mm256_or_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_or_pd (simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_or_pd(a, b); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128d[0] = simde_mm_or_pd(a_.m128d[0], b_.m128d[0]); - r_.m128d[1] = simde_mm_or_pd(a_.m128d[1], b_.m128d[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f | b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = a_.u64[i] | b_.u64[i]; - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_or_pd - #define _mm256_or_pd(a, b) simde_mm256_or_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_permute_ps (simde__m256 a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.m128_private[i >> 2].f32[(imm8 >> ((i << 1) & 7)) & 3]; - } - - return simde__m256_from_private(r_); -} -#if defined(SIMDE_X86_AVX_NATIVE) -# define simde_mm256_permute_ps(a, imm8) _mm256_permute_ps(a, imm8) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_permute_ps - #define _mm256_permute_ps(a, imm8) simde_mm256_permute_ps(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_permute_pd (simde__m256d a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) { - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f64[((imm8 >> i) & 1) + (i & 2)]; - } - - return simde__m256d_from_private(r_); -} -#if defined(SIMDE_X86_AVX_NATIVE) -# define simde_mm256_permute_pd(a, imm8) _mm256_permute_pd(a, imm8) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_permute_pd - #define _mm256_permute_pd(a, imm8) simde_mm256_permute_pd(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_permute_ps (simde__m128 a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[(imm8 >> ((i << 1) & 7)) & 3]; - } - - return simde__m128_from_private(r_); -} -#if defined(SIMDE_X86_AVX_NATIVE) -# define simde_mm_permute_ps(a, imm8) _mm_permute_ps(a, imm8) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) -# define simde_mm_permute_ps(a, imm8) simde__m128_from_wasm_v128(wasm_i32x4_shuffle(simde__m128_to_wasm_v128(a), simde__m128_to_wasm_v128(a), ((imm8) & 3), (((imm8) >> 2) & 3 ), (((imm8) >> 4) & 3), (((imm8) >> 6) & 3))) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm_permute_ps - #define _mm_permute_ps(a, imm8) simde_mm_permute_ps(a, imm8) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_permute_pd (simde__m128d a, const int imm8) - 
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) { - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f64[((imm8 >> i) & 1) + (i & 2)]; - } - - return simde__m128d_from_private(r_); -} -#if defined(SIMDE_X86_AVX_NATIVE) -# define simde_mm_permute_pd(a, imm8) _mm_permute_pd(a, imm8) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) -# define simde_mm_permute_pd(a, imm8) simde__m128d_from_wasm_v128(wasm_i64x2_shuffle(simde__m128d_to_wasm_v128(a), simde__m128d_to_wasm_v128(a), ((imm8) & 1), (((imm8) >> 1) & 1 ))) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm_permute_pd - #define _mm_permute_pd(a, imm8) simde_mm_permute_pd(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_permutevar_ps (simde__m128 a, simde__m128i b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm_permutevar_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - simde__m128i_private b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_make( - (a_.f32[wasm_i32x4_extract_lane(b_.wasm_v128, 0) & 3]), - (a_.f32[wasm_i32x4_extract_lane(b_.wasm_v128, 1) & 3]), - (a_.f32[wasm_i32x4_extract_lane(b_.wasm_v128, 2) & 3]), - (a_.f32[wasm_i32x4_extract_lane(b_.wasm_v128, 3) & 3])); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[b_.i32[i] & 3]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm_permutevar_ps - #define _mm_permutevar_ps(a, b) simde_mm_permutevar_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_permutevar_pd (simde__m128d a, simde__m128i b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm_permutevar_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - simde__m128i_private b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_make( - (a_.f64[(wasm_i64x2_extract_lane(b_.wasm_v128, 0) >> 1) & 1]), - (a_.f64[(wasm_i64x2_extract_lane(b_.wasm_v128, 1) >> 1) & 1])); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f64[(b_.i64[i] & 2) >> 1]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm_permutevar_pd - #define _mm_permutevar_pd(a, b) simde_mm_permutevar_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_permutevar_ps (simde__m256 a, simde__m256i b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_permutevar_ps(a, b); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - simde__m256i_private b_ = simde__m256i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[(b_.i32[i] & 3) + (i & 4)]; - } - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_permutevar_ps - #define _mm256_permutevar_ps(a, b) simde_mm256_permutevar_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_permutevar_pd (simde__m256d a, simde__m256i b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_permutevar_pd(a, b); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - simde__m256i_private b_ = 
simde__m256i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f64[((b_.i64[i] & 2) >> 1) + (i & 2)]; - } - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_permutevar_pd - #define _mm256_permutevar_pd(a, b) simde_mm256_permutevar_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_permute2f128_ps (simde__m256 a, simde__m256 b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - r_.m128_private[0] = (imm8 & 0x08) ? simde__m128_to_private(simde_mm_setzero_ps()) : ((imm8 & 0x02) ? b_.m128_private[(imm8 ) & 1] : a_.m128_private[(imm8 ) & 1]); - r_.m128_private[1] = (imm8 & 0x80) ? simde__m128_to_private(simde_mm_setzero_ps()) : ((imm8 & 0x20) ? b_.m128_private[(imm8 >> 4) & 1] : a_.m128_private[(imm8 >> 4) & 1]); - - return simde__m256_from_private(r_); -} -#if defined(SIMDE_X86_AVX_NATIVE) -# define simde_mm256_permute2f128_ps(a, b, imm8) _mm256_permute2f128_ps(a, b, imm8) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_permute2f128_ps - #define _mm256_permute2f128_ps(a, b, imm8) simde_mm256_permute2f128_ps(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_permute2f128_pd (simde__m256d a, simde__m256d b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - r_.m128d_private[0] = (imm8 & 0x08) ? simde__m128d_to_private(simde_mm_setzero_pd()) : ((imm8 & 0x02) ? b_.m128d_private[(imm8 ) & 1] : a_.m128d_private[(imm8 ) & 1]); - r_.m128d_private[1] = (imm8 & 0x80) ? simde__m128d_to_private(simde_mm_setzero_pd()) : ((imm8 & 0x20) ? b_.m128d_private[(imm8 >> 4) & 1] : a_.m128d_private[(imm8 >> 4) & 1]); - - return simde__m256d_from_private(r_); -} -#if defined(SIMDE_X86_AVX_NATIVE) -# define simde_mm256_permute2f128_pd(a, b, imm8) _mm256_permute2f128_pd(a, b, imm8) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_permute2f128_pd - #define _mm256_permute2f128_pd(a, b, imm8) simde_mm256_permute2f128_pd(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_permute2f128_si256 (simde__m256i a, simde__m256i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - r_.m128i_private[0] = (imm8 & 0x08) ? simde__m128i_to_private(simde_mm_setzero_si128()) : ((imm8 & 0x02) ? b_.m128i_private[(imm8 ) & 1] : a_.m128i_private[(imm8 ) & 1]); - r_.m128i_private[1] = (imm8 & 0x80) ? simde__m128i_to_private(simde_mm_setzero_si128()) : ((imm8 & 0x20) ? 
b_.m128i_private[(imm8 >> 4) & 1] : a_.m128i_private[(imm8 >> 4) & 1]); - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX_NATIVE) -# define simde_mm256_permute2f128_si128(a, b, imm8) _mm256_permute2f128_si128(a, b, imm8) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_permute2f128_si256 - #define _mm256_permute2f128_si256(a, b, imm8) simde_mm256_permute2f128_si256(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_rcp_ps (simde__m256 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_rcp_ps(a); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128[0] = simde_mm_rcp_ps(a_.m128[0]); - r_.m128[1] = simde_mm_rcp_ps(a_.m128[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = SIMDE_FLOAT32_C(1.0) / a_.f32[i]; - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_rcp_ps - #define _mm256_rcp_ps(a) simde_mm256_rcp_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_rsqrt_ps (simde__m256 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_rsqrt_ps(a); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if defined(simde_math_sqrtf) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = 1.0f / simde_math_sqrtf(a_.f32[i]); - } - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_rsqrt_ps - #define _mm256_rsqrt_ps(a) simde_mm256_rsqrt_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_setr_epi8 ( - int8_t e31, int8_t e30, int8_t e29, int8_t e28, int8_t e27, int8_t e26, int8_t e25, int8_t e24, - int8_t e23, int8_t e22, int8_t e21, int8_t e20, int8_t e19, int8_t e18, int8_t e17, int8_t e16, - int8_t e15, int8_t e14, int8_t e13, int8_t e12, int8_t e11, int8_t e10, int8_t e9, int8_t e8, - int8_t e7, int8_t e6, int8_t e5, int8_t e4, int8_t e3, int8_t e2, int8_t e1, int8_t e0) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_setr_epi8( - e31, e30, e29, e28, e27, e26, e25, e24, - e23, e22, e21, e20, e19, e18, e17, e16, - e15, e14, e13, e12, e11, e10, e9, e8, - e7, e6, e5, e4, e3, e2, e1, e0); - #else - return simde_mm256_set_epi8( - e0, e1, e2, e3, e4, e5, e6, e7, - e8, e9, e10, e11, e12, e13, e14, e15, - e16, e17, e18, e19, e20, e21, e22, e23, - e24, e25, e26, e27, e28, e29, e30, e31); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_setr_epi8 - #define _mm256_setr_epi8(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) \ - simde_mm256_setr_epi8(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_setr_epi16 ( - int16_t e15, int16_t e14, int16_t e13, int16_t e12, int16_t e11, int16_t e10, int16_t e9, int16_t e8, - int16_t e7, int16_t e6, int16_t e5, int16_t e4, int16_t e3, int16_t e2, int16_t e1, int16_t e0) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_setr_epi16( - e15, e14, e13, e12, e11, e10, e9, e8, - e7, e6, e5, e4, e3, e2, e1, e0); - #else - return simde_mm256_set_epi16( - e0, e1, e2, e3, e4, e5, e6, e7, - 
e8, e9, e10, e11, e12, e13, e14, e15); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_setr_epi16 - #define _mm256_setr_epi16(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) \ - simde_mm256_setr_epi16(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_setr_epi32 ( - int32_t e7, int32_t e6, int32_t e5, int32_t e4, int32_t e3, int32_t e2, int32_t e1, int32_t e0) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0); - #else - return simde_mm256_set_epi32(e0, e1, e2, e3, e4, e5, e6, e7); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_setr_epi32 - #define _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0) \ - simde_mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_setr_epi64x (int64_t e3, int64_t e2, int64_t e1, int64_t e0) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_setr_epi64x(e3, e2, e1, e0); - #else - return simde_mm256_set_epi64x(e0, e1, e2, e3); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_setr_epi64x - #define _mm256_setr_epi64x(e3, e2, e1, e0) \ - simde_mm256_setr_epi64x(e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_setr_ps ( - simde_float32 e7, simde_float32 e6, simde_float32 e5, simde_float32 e4, - simde_float32 e3, simde_float32 e2, simde_float32 e1, simde_float32 e0) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_setr_ps(e7, e6, e5, e4, e3, e2, e1, e0); - #else - return simde_mm256_set_ps(e0, e1, e2, e3, e4, e5, e6, e7); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_setr_ps - #define _mm256_setr_ps(e7, e6, e5, e4, e3, e2, e1, e0) \ - simde_mm256_setr_ps(e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_setr_pd (simde_float64 e3, simde_float64 e2, simde_float64 e1, simde_float64 e0) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_setr_pd(e3, e2, e1, e0); - #else - return simde_mm256_set_pd(e0, e1, e2, e3); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_setr_pd - #define _mm256_setr_pd(e3, e2, e1, e0) \ - simde_mm256_setr_pd(e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_setr_m128 (simde__m128 lo, simde__m128 hi) { - #if defined(SIMDE_X86_AVX_NATIVE) && \ - !defined(SIMDE_BUG_GCC_REV_247851) && \ - SIMDE_DETECT_CLANG_VERSION_CHECK(3,6,0) - return _mm256_setr_m128(lo, hi); - #else - return simde_mm256_set_m128(hi, lo); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_setr_m128 - #define _mm256_setr_m128(lo, hi) \ - simde_mm256_setr_m128(lo, hi) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_setr_m128d (simde__m128d lo, simde__m128d hi) { - #if defined(SIMDE_X86_AVX_NATIVE) && \ - !defined(SIMDE_BUG_GCC_REV_247851) && \ - SIMDE_DETECT_CLANG_VERSION_CHECK(3,6,0) - return _mm256_setr_m128d(lo, hi); - #else - return simde_mm256_set_m128d(hi, lo); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_setr_m128d - #define _mm256_setr_m128d(lo, hi) \ - simde_mm256_setr_m128d(lo, hi) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_setr_m128i (simde__m128i lo, simde__m128i hi) { - #if defined(SIMDE_X86_AVX_NATIVE) && \ - !defined(SIMDE_BUG_GCC_REV_247851) && \ - SIMDE_DETECT_CLANG_VERSION_CHECK(3,6,0) - return 
_mm256_setr_m128i(lo, hi); - #else - return simde_mm256_set_m128i(hi, lo); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_setr_m128i - #define _mm256_setr_m128i(lo, hi) \ - simde_mm256_setr_m128i(lo, hi) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_shuffle_ps (simde__m256 a, simde__m256 b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - r_.f32[0] = a_.m128_private[0].f32[(imm8 >> 0) & 3]; - r_.f32[1] = a_.m128_private[0].f32[(imm8 >> 2) & 3]; - r_.f32[2] = b_.m128_private[0].f32[(imm8 >> 4) & 3]; - r_.f32[3] = b_.m128_private[0].f32[(imm8 >> 6) & 3]; - r_.f32[4] = a_.m128_private[1].f32[(imm8 >> 0) & 3]; - r_.f32[5] = a_.m128_private[1].f32[(imm8 >> 2) & 3]; - r_.f32[6] = b_.m128_private[1].f32[(imm8 >> 4) & 3]; - r_.f32[7] = b_.m128_private[1].f32[(imm8 >> 6) & 3]; - - return simde__m256_from_private(r_); -} -#if defined(SIMDE_X86_AVX_NATIVE) - #define simde_mm256_shuffle_ps(a, b, imm8) _mm256_shuffle_ps(a, b, imm8) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) - #define simde_mm256_shuffle_ps(a, b, imm8) \ - simde_mm256_set_m128( \ - simde_mm_shuffle_ps(simde_mm256_extractf128_ps(a, 1), simde_mm256_extractf128_ps(b, 1), (imm8)), \ - simde_mm_shuffle_ps(simde_mm256_extractf128_ps(a, 0), simde_mm256_extractf128_ps(b, 0), (imm8))) -#elif defined(SIMDE_SHUFFLE_VECTOR_) - #define simde_mm256_shuffle_ps(a, b, imm8) \ - SIMDE_SHUFFLE_VECTOR_(32, 32, a, b, \ - (((imm8) >> 0) & 3) + 0, \ - (((imm8) >> 2) & 3) + 0, \ - (((imm8) >> 4) & 3) + 8, \ - (((imm8) >> 6) & 3) + 8, \ - (((imm8) >> 0) & 3) + 4, \ - (((imm8) >> 2) & 3) + 4, \ - (((imm8) >> 4) & 3) + 12, \ - (((imm8) >> 6) & 3) + 12) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_shuffle_ps - #define _mm256_shuffle_ps(a, b, imm8) simde_mm256_shuffle_ps(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_shuffle_pd (simde__m256d a, simde__m256d b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) { - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - r_.f64[0] = a_.f64[((imm8 ) & 1) ]; - r_.f64[1] = b_.f64[((imm8 >> 1) & 1) ]; - r_.f64[2] = a_.f64[((imm8 >> 2) & 1) | 2]; - r_.f64[3] = b_.f64[((imm8 >> 3) & 1) | 2]; - - return simde__m256d_from_private(r_); -} -#if defined(SIMDE_X86_AVX_NATIVE) - #define simde_mm256_shuffle_pd(a, b, imm8) _mm256_shuffle_pd(a, b, imm8) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) - #define simde_mm256_shuffle_pd(a, b, imm8) \ - simde_mm256_set_m128d( \ - simde_mm_shuffle_pd(simde_mm256_extractf128_pd(a, 1), simde_mm256_extractf128_pd(b, 1), (imm8 >> 2) & 3), \ - simde_mm_shuffle_pd(simde_mm256_extractf128_pd(a, 0), simde_mm256_extractf128_pd(b, 0), (imm8 >> 0) & 3)) -#elif defined(SIMDE_SHUFFLE_VECTOR_) - #define simde_mm256_shuffle_pd(a, b, imm8) \ - SIMDE_SHUFFLE_VECTOR_(64, 32, a, b, \ - (((imm8) >> 0) & 1) + 0, \ - (((imm8) >> 1) & 1) + 4, \ - (((imm8) >> 2) & 1) + 2, \ - (((imm8) >> 3) & 1) + 6) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_shuffle_pd - #define _mm256_shuffle_pd(a, b, imm8) simde_mm256_shuffle_pd(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_sqrt_ps (simde__m256 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_sqrt_ps(a); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128[0] = 
simde_mm_sqrt_ps(a_.m128[0]); - r_.m128[1] = simde_mm_sqrt_ps(a_.m128[1]); - #elif defined(simde_math_sqrtf) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_sqrtf(a_.f32[i]); - } - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_sqrt_ps - #define _mm256_sqrt_ps(a) simde_mm256_sqrt_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_sqrt_pd (simde__m256d a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_sqrt_pd(a); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128d[0] = simde_mm_sqrt_pd(a_.m128d[0]); - r_.m128d[1] = simde_mm_sqrt_pd(a_.m128d[1]); - #elif defined(simde_math_sqrt) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_sqrt(a_.f64[i]); - } - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_sqrt_pd - #define _mm256_sqrt_pd(a) simde_mm256_sqrt_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_store_ps (simde_float32 mem_addr[8], simde__m256 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - _mm256_store_ps(mem_addr, a); - #else - simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256), &a, sizeof(a)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_store_ps - #define _mm256_store_ps(mem_addr, a) simde_mm256_store_ps(HEDLEY_REINTERPRET_CAST(float*, mem_addr), a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_store_pd (simde_float64 mem_addr[4], simde__m256d a) { - #if defined(SIMDE_X86_AVX_NATIVE) - _mm256_store_pd(mem_addr, a); - #else - simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256d), &a, sizeof(a)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_store_pd - #define _mm256_store_pd(mem_addr, a) simde_mm256_store_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_store_si256 (simde__m256i* mem_addr, simde__m256i a) { - #if defined(SIMDE_X86_AVX_NATIVE) - _mm256_store_si256(mem_addr, a); - #else - simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), &a, sizeof(a)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_store_si256 - #define _mm256_store_si256(mem_addr, a) simde_mm256_store_si256(mem_addr, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_storeu_ps (simde_float32 mem_addr[8], simde__m256 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - _mm256_storeu_ps(mem_addr, a); - #else - simde_memcpy(mem_addr, &a, sizeof(a)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_storeu_ps - #define _mm256_storeu_ps(mem_addr, a) simde_mm256_storeu_ps(HEDLEY_REINTERPRET_CAST(float*, mem_addr), a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_storeu_pd (simde_float64 mem_addr[4], simde__m256d a) { - #if defined(SIMDE_X86_AVX_NATIVE) - _mm256_storeu_pd(mem_addr, a); - #else - simde_memcpy(mem_addr, &a, sizeof(a)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_storeu_pd - #define _mm256_storeu_pd(mem_addr, a) simde_mm256_storeu_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_storeu_si256 (void* mem_addr, simde__m256i a) { - #if 
defined(SIMDE_X86_AVX_NATIVE) - _mm256_storeu_si256(SIMDE_ALIGN_CAST(__m256i*, mem_addr), a); - #else - simde_memcpy(mem_addr, &a, sizeof(a)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_storeu_si256 - #define _mm256_storeu_si256(mem_addr, a) simde_mm256_storeu_si256(mem_addr, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_storeu2_m128 (simde_float32 hi_addr[4], simde_float32 lo_addr[4], simde__m256 a) { - #if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341) && !defined(SIMDE_BUG_MCST_LCC_MISSING_AVX_LOAD_STORE_M128_FUNCS) - _mm256_storeu2_m128(hi_addr, lo_addr, a); - #else - simde_mm_storeu_ps(lo_addr, simde_mm256_castps256_ps128(a)); - simde_mm_storeu_ps(hi_addr, simde_mm256_extractf128_ps(a, 1)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_storeu2_m128 - #define _mm256_storeu2_m128(hi_addr, lo_addr, a) simde_mm256_storeu2_m128(hi_addr, lo_addr, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_storeu2_m128d (simde_float64 hi_addr[2], simde_float64 lo_addr[2], simde__m256d a) { - #if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341) && !defined(SIMDE_BUG_MCST_LCC_MISSING_AVX_LOAD_STORE_M128_FUNCS) - _mm256_storeu2_m128d(hi_addr, lo_addr, a); - #else - simde_mm_storeu_pd(lo_addr, simde_mm256_castpd256_pd128(a)); - simde_mm_storeu_pd(hi_addr, simde_mm256_extractf128_pd(a, 1)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_storeu2_m128d - #define _mm256_storeu2_m128d(hi_addr, lo_addr, a) simde_mm256_storeu2_m128d(hi_addr, lo_addr, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_storeu2_m128i (simde__m128i* hi_addr, simde__m128i* lo_addr, simde__m256i a) { - #if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341) && !defined(SIMDE_BUG_MCST_LCC_MISSING_AVX_LOAD_STORE_M128_FUNCS) - _mm256_storeu2_m128i(hi_addr, lo_addr, a); - #else - simde_mm_storeu_si128(lo_addr, simde_mm256_castsi256_si128(a)); - simde_mm_storeu_si128(hi_addr, simde_mm256_extractf128_si256(a, 1)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_storeu2_m128i - #define _mm256_storeu2_m128i(hi_addr, lo_addr, a) simde_mm256_storeu2_m128i(hi_addr, lo_addr, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_stream_ps (simde_float32 mem_addr[8], simde__m256 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - _mm256_stream_ps(mem_addr, a); - #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT) - __builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr)); - #else - simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256), &a, sizeof(a)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_stream_ps - #define _mm256_stream_ps(mem_addr, a) simde_mm256_stream_ps(HEDLEY_REINTERPRET_CAST(float*, mem_addr), a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_stream_pd (simde_float64 mem_addr[4], simde__m256d a) { - #if defined(SIMDE_X86_AVX_NATIVE) - _mm256_stream_pd(mem_addr, a); - #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT) - __builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr)); - #else - simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256d), &a, sizeof(a)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_stream_pd - #define _mm256_stream_pd(mem_addr, a) simde_mm256_stream_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a) -#endif - 
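[Editor's note, not part of the patch] The deleted simde code above repeats one pattern for every 256-bit store/stream intrinsic: call the native _mm256_* function when SIMDE_X86_AVX_NATIVE is defined, prefer a compiler builtin such as __builtin_nontemporal_store where one exists, and otherwise fall back to a plain memcpy over a struct of lanes. The standalone C sketch below is purely illustrative of that fallback shape; the my_* names are invented here and are not simde API.

#include <stdio.h>
#include <string.h>

/* Stand-in for simde__m256: eight packed single-precision floats. */
typedef struct { float f32[8]; } my_m256;

/* Unaligned-store fallback: a byte-wise copy is always well-defined C,
   and the compiler is free to lower it to vector moves on capable targets. */
static void my_mm256_storeu_ps(float *mem_addr, my_m256 a) {
    memcpy(mem_addr, &a, sizeof(a));
}

int main(void) {
    my_m256 v = { { 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f } };
    float out[8];
    my_mm256_storeu_ps(out, v);
    printf("%g %g\n", (double)out[0], (double)out[7]); /* prints: 0 7 */
    return 0;
}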
-SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_stream_si256 (simde__m256i* mem_addr, simde__m256i a) { - #if defined(SIMDE_X86_AVX_NATIVE) - _mm256_stream_si256(mem_addr, a); - #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT) - __builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr)); - #else - simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), &a, sizeof(a)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_stream_si256 - #define _mm256_stream_si256(mem_addr, a) simde_mm256_stream_si256(mem_addr, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_sub_ps (simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_sub_ps(a, b); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128[0] = simde_mm_sub_ps(a_.m128[0], b_.m128[0]); - r_.m128[1] = simde_mm_sub_ps(a_.m128[1], b_.m128[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f32 = a_.f32 - b_.f32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[i] - b_.f32[i]; - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_sub_ps - #define _mm256_sub_ps(a, b) simde_mm256_sub_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_hsub_ps (simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_hsub_ps(a, b); - #else - return simde_mm256_sub_ps(simde_x_mm256_deinterleaveeven_ps(a, b), simde_x_mm256_deinterleaveodd_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_hsub_ps - #define _mm256_hsub_ps(a, b) simde_mm256_hsub_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_sub_pd (simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_sub_pd(a, b); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128d[0] = simde_mm_sub_pd(a_.m128d[0], b_.m128d[0]); - r_.m128d[1] = simde_mm_sub_pd(a_.m128d[1], b_.m128d[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f64 = a_.f64 - b_.f64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f64[i] - b_.f64[i]; - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_sub_pd - #define _mm256_sub_pd(a, b) simde_mm256_sub_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_hsub_pd (simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_hsub_pd(a, b); - #else - return simde_mm256_sub_pd(simde_x_mm256_deinterleaveeven_pd(a, b), simde_x_mm256_deinterleaveodd_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_hsub_pd - #define _mm256_hsub_pd(a, b) simde_mm256_hsub_pd(a, b) -#endif - -#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) - HEDLEY_DIAGNOSTIC_PUSH - SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_undefined_ps (void) { - simde__m256_private r_; - -#if \ - defined(SIMDE_X86_AVX_NATIVE) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(5,0,0)) && \ - (!defined(__has_builtin) || 
HEDLEY_HAS_BUILTIN(__builtin_ia32_undef256)) - r_.n = _mm256_undefined_ps(); -#elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) - r_ = simde__m256_to_private(simde_mm256_setzero_ps()); -#endif - - return simde__m256_from_private(r_); -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_undefined_ps - #define _mm256_undefined_ps() simde_mm256_undefined_ps() -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_undefined_pd (void) { - simde__m256d_private r_; - -#if \ - defined(SIMDE_X86_AVX_NATIVE) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(5,0,0)) && \ - (!defined(__has_builtin) || HEDLEY_HAS_BUILTIN(__builtin_ia32_undef256)) - r_.n = _mm256_undefined_pd(); -#elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) - r_ = simde__m256d_to_private(simde_mm256_setzero_pd()); -#endif - - return simde__m256d_from_private(r_); -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_undefined_pd - #define _mm256_undefined_pd() simde_mm256_undefined_pd() -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_undefined_si256 (void) { - simde__m256i_private r_; -#if \ - defined(SIMDE_X86_AVX_NATIVE) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(5,0,0)) && \ - (!defined(__has_builtin) || HEDLEY_HAS_BUILTIN(__builtin_ia32_undef256)) - r_.n = _mm256_undefined_si256(); -#elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) - r_ = simde__m256i_to_private(simde_mm256_setzero_si256()); -#endif - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_undefined_si256 - #define _mm256_undefined_si256() simde_mm256_undefined_si256() -#endif - -#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) - HEDLEY_DIAGNOSTIC_POP -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_xor_ps (simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_xor_ps(a, b); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128[0] = simde_mm_xor_ps(a_.m128[0], b_.m128[0]); - r_.m128[1] = simde_mm_xor_ps(a_.m128[1], b_.m128[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f ^ b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] ^ b_.u32[i]; - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_xor_ps - #define _mm256_xor_ps(a, b) simde_mm256_xor_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_xor_pd (simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_xor_pd(a, b); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128d[0] = simde_mm_xor_pd(a_.m128d[0], b_.m128d[0]); - r_.m128d[1] = simde_mm_xor_pd(a_.m128d[1], b_.m128d[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f ^ b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = a_.u64[i] ^ b_.u64[i]; - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_xor_pd - #define _mm256_xor_pd(a, b) simde_mm256_xor_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_x_mm256_xorsign_ps(simde__m256 dest, 
simde__m256 src) { - return simde_mm256_xor_ps(simde_mm256_and_ps(simde_mm256_set1_ps(-0.0f), src), dest); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_x_mm256_xorsign_pd(simde__m256d dest, simde__m256d src) { - return simde_mm256_xor_pd(simde_mm256_and_pd(simde_mm256_set1_pd(-0.0), src), dest); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_x_mm256_negate_ps(simde__m256 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return simde_mm256_xor_ps(a,_mm256_set1_ps(SIMDE_FLOAT32_C(-0.0))); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if defined(SIMDE_VECTOR_NEGATE) - r_.f32 = -a_.f32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = -a_.f32[i]; - } - #endif - - return simde__m256_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_x_mm256_negate_pd(simde__m256d a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return simde_mm256_xor_pd(a, _mm256_set1_pd(SIMDE_FLOAT64_C(-0.0))); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if defined(SIMDE_VECTOR_NEGATE) - r_.f64 = -a_.f64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = -a_.f64[i]; - } - #endif - - return simde__m256d_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_unpackhi_ps (simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_unpackhi_ps(a, b); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - #if defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.f32, b_.f32, 2, 10, 3, 11, 6, 14, 7, 15); - #else - r_.f32[0] = a_.f32[2]; - r_.f32[1] = b_.f32[2]; - r_.f32[2] = a_.f32[3]; - r_.f32[3] = b_.f32[3]; - r_.f32[4] = a_.f32[6]; - r_.f32[5] = b_.f32[6]; - r_.f32[6] = a_.f32[7]; - r_.f32[7] = b_.f32[7]; - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_unpackhi_ps - #define _mm256_unpackhi_ps(a, b) simde_mm256_unpackhi_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_unpackhi_pd (simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_unpackhi_pd(a, b); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - #if defined(SIMDE_SHUFFLE_VECTOR_) - r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 32, a_.f64, b_.f64, 1, 5, 3, 7); - #else - r_.f64[0] = a_.f64[1]; - r_.f64[1] = b_.f64[1]; - r_.f64[2] = a_.f64[3]; - r_.f64[3] = b_.f64[3]; - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_unpackhi_pd - #define _mm256_unpackhi_pd(a, b) simde_mm256_unpackhi_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_unpacklo_ps (simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_unpacklo_ps(a, b); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - #if defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.f32, b_.f32, 0, 8, 1, 9, 4, 12, 5, 13); - #else - r_.f32[0] = a_.f32[0]; - r_.f32[1] = b_.f32[0]; - r_.f32[2] = a_.f32[1]; - r_.f32[3] = b_.f32[1]; - r_.f32[4] = a_.f32[4]; - r_.f32[5] = b_.f32[4]; - r_.f32[6] = a_.f32[5]; - r_.f32[7] = b_.f32[5]; - #endif - - return simde__m256_from_private(r_); - #endif -} -#if 
defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_unpacklo_ps - #define _mm256_unpacklo_ps(a, b) simde_mm256_unpacklo_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_unpacklo_pd (simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_unpacklo_pd(a, b); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - #if defined(SIMDE_SHUFFLE_VECTOR_) - r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 32, a_.f64, b_.f64, 0, 4, 2, 6); - #else - r_.f64[0] = a_.f64[0]; - r_.f64[1] = b_.f64[0]; - r_.f64[2] = a_.f64[2]; - r_.f64[3] = b_.f64[2]; - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_unpacklo_pd - #define _mm256_unpacklo_pd(a, b) simde_mm256_unpacklo_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_zextps128_ps256 (simde__m128 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_insertf128_ps(_mm256_setzero_ps(), a, 0); - #else - simde__m256_private r_; - - r_.m128_private[0] = simde__m128_to_private(a); - r_.m128_private[1] = simde__m128_to_private(simde_mm_setzero_ps()); - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_zextps128_ps256 - #define _mm256_zextps128_ps256(a) simde_mm256_zextps128_ps256(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_zextpd128_pd256 (simde__m128d a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_insertf128_pd(_mm256_setzero_pd(), a, 0); - #else - simde__m256d_private r_; - - r_.m128d_private[0] = simde__m128d_to_private(a); - r_.m128d_private[1] = simde__m128d_to_private(simde_mm_setzero_pd()); - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_zextpd128_pd256 - #define _mm256_zextpd128_pd256(a) simde_mm256_zextpd128_pd256(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_zextsi128_si256 (simde__m128i a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_insertf128_si256(_mm256_setzero_si256(), a, 0); - #else - simde__m256i_private r_; - - r_.m128i_private[0] = simde__m128i_to_private(a); - r_.m128i_private[1] = simde__m128i_to_private(simde_mm_setzero_si128()); - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_zextsi128_si256 - #define _mm256_zextsi128_si256(a) simde_mm256_zextsi128_si256(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_testc_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm_testc_ps(a, b); - #else - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t m = wasm_u32x4_shr(wasm_v128_or(wasm_v128_not(b_.wasm_v128), a_.wasm_v128), 31); - m = wasm_v128_and(m, simde_mm_movehl_ps(m, m)); - m = wasm_v128_and(m, simde_mm_shuffle_epi32(m, SIMDE_MM_SHUFFLE(3, 2, 0, 1))); - return wasm_i32x4_extract_lane(m, 0); - #else - uint_fast32_t r = 0; - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { - r |= ~a_.u32[i] & b_.u32[i]; - } - - return HEDLEY_STATIC_CAST(int, ((~r >> 31) & 1)); - #endif - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm_testc_ps - #define _mm_testc_ps(a, b) simde_mm_testc_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_testc_pd (simde__m128d a, simde__m128d b) { - 
#if defined(SIMDE_X86_AVX_NATIVE) - return _mm_testc_pd(a, b); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t m = wasm_u64x2_shr(wasm_v128_or(wasm_v128_not(b_.wasm_v128), a_.wasm_v128), 63); - return HEDLEY_STATIC_CAST(int, wasm_i64x2_extract_lane(m, 0) & wasm_i64x2_extract_lane(m, 1)); - #else - uint_fast64_t r = 0; - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { - r |= ~a_.u64[i] & b_.u64[i]; - } - - return HEDLEY_STATIC_CAST(int, ((~r >> 63) & 1)); - #endif - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm_testc_pd - #define _mm_testc_pd(a, b) simde_mm_testc_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm256_testc_ps (simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_testc_ps(a, b); - #else - uint_fast32_t r = 0; - simde__m256_private - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { - r |= ~a_.u32[i] & b_.u32[i]; - } - - return HEDLEY_STATIC_CAST(int, ((~r >> 31) & 1)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_testc_ps - #define _mm256_testc_ps(a, b) simde_mm256_testc_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm256_testc_pd (simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_testc_pd(a, b); - #else - uint_fast64_t r = 0; - simde__m256d_private - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { - r |= ~a_.u64[i] & b_.u64[i]; - } - - return HEDLEY_STATIC_CAST(int, ((~r >> 63) & 1)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_testc_pd - #define _mm256_testc_pd(a, b) simde_mm256_testc_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm256_testc_si256 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_testc_si256(a, b); - #else - int_fast32_t r = 0; - simde__m256i_private - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) { - r |= ~a_.i32f[i] & b_.i32f[i]; - } - - return HEDLEY_STATIC_CAST(int, !r); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_testc_si256 - #define _mm256_testc_si256(a, b) simde_mm256_testc_si256(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_testz_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm_testz_ps(a, b); - #else - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t m = wasm_u32x4_shr(wasm_v128_not(wasm_v128_and(a_.wasm_v128, b_.wasm_v128)), 31); - m = wasm_v128_and(m, simde_mm_movehl_ps(m, m)); - m = wasm_v128_and(m, simde_mm_shuffle_epi32(m, SIMDE_MM_SHUFFLE(3, 2, 0, 1))); - return wasm_i32x4_extract_lane(m, 0); - #else - uint_fast32_t r = 0; - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { - r |= a_.u32[i] & b_.u32[i]; - } - - return HEDLEY_STATIC_CAST(int, ((~r >> 31) & 1)); - #endif - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm_testz_ps - #define 
_mm_testz_ps(a, b) simde_mm_testz_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_testz_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm_testz_pd(a, b); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t m = wasm_u64x2_shr(wasm_v128_not(wasm_v128_and(a_.wasm_v128, b_.wasm_v128)), 63); - return HEDLEY_STATIC_CAST(int, wasm_i64x2_extract_lane(m, 0) & wasm_i64x2_extract_lane(m, 1)); - #else - uint_fast64_t r = 0; - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { - r |= a_.u64[i] & b_.u64[i]; - } - - return HEDLEY_STATIC_CAST(int, ((~r >> 63) & 1)); - #endif - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm_testz_pd - #define _mm_testz_pd(a, b) simde_mm_testz_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm256_testz_ps (simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_testz_ps(a, b); - #else - uint_fast32_t r = 0; - simde__m256_private - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { - r |= a_.u32[i] & b_.u32[i]; - } - - return HEDLEY_STATIC_CAST(int, ((~r >> 31) & 1)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_testz_ps - #define _mm256_testz_ps(a, b) simde_mm256_testz_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm256_testz_pd (simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_testz_pd(a, b); - #else - uint_fast64_t r = 0; - simde__m256d_private - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { - r |= a_.u64[i] & b_.u64[i]; - } - - return HEDLEY_STATIC_CAST(int, ((~r >> 63) & 1)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_testz_pd - #define _mm256_testz_pd(a, b) simde_mm256_testz_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm256_testz_si256 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_testz_si256(a, b); - #else - int_fast32_t r = 0; - simde__m256i_private - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r = simde_mm_testz_si128(a_.m128i[0], b_.m128i[0]) && simde_mm_testz_si128(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) { - r |= a_.i32f[i] & b_.i32f[i]; - } - - r = !r; - #endif - - return HEDLEY_STATIC_CAST(int, r); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_testz_si256 - #define _mm256_testz_si256(a, b) simde_mm256_testz_si256(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_testnzc_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm_testnzc_ps(a, b); - #else - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t m = wasm_u32x4_shr(wasm_v128_and(a_.wasm_v128, b_.wasm_v128), 31); - v128_t m2 = wasm_u32x4_shr(wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128), 31); - m = wasm_v128_or(m, simde_mm_movehl_ps(m, m)); - m2 = wasm_v128_or(m2, simde_mm_movehl_ps(m2, m2)); - m = 
wasm_v128_or(m, simde_mm_shuffle_epi32(m, SIMDE_MM_SHUFFLE(3, 2, 0, 1))); - m2 = wasm_v128_or(m2, simde_mm_shuffle_epi32(m2, SIMDE_MM_SHUFFLE(3, 2, 0, 1))); - return wasm_i32x4_extract_lane(m, 0) & wasm_i32x4_extract_lane(m2, 0); - #else - uint32_t rz = 0, rc = 0; - for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { - rc |= ~a_.u32[i] & b_.u32[i]; - rz |= a_.u32[i] & b_.u32[i]; - } - - return - (rc >> ((sizeof(rc) * CHAR_BIT) - 1)) & - (rz >> ((sizeof(rz) * CHAR_BIT) - 1)); - #endif - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm_testnzc_ps - #define _mm_testnzc_ps(a, b) simde_mm_testnzc_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_testnzc_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm_testnzc_pd(a, b); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - #if defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t m = wasm_u64x2_shr(wasm_v128_and(a_.wasm_v128, b_.wasm_v128), 63); - v128_t m2 = wasm_u64x2_shr(wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128), 63); - return HEDLEY_STATIC_CAST(int, (wasm_i64x2_extract_lane(m, 0) | wasm_i64x2_extract_lane(m, 1)) - & (wasm_i64x2_extract_lane(m2, 0) | wasm_i64x2_extract_lane(m2, 1))); - #else - uint64_t rc = 0, rz = 0; - for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { - rc |= ~a_.u64[i] & b_.u64[i]; - rz |= a_.u64[i] & b_.u64[i]; - } - - return - (rc >> ((sizeof(rc) * CHAR_BIT) - 1)) & - (rz >> ((sizeof(rz) * CHAR_BIT) - 1)); - #endif - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm_testnzc_pd - #define _mm_testnzc_pd(a, b) simde_mm_testnzc_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm256_testnzc_ps (simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_testnzc_ps(a, b); - #else - uint32_t rc = 0, rz = 0; - simde__m256_private - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { - rc |= ~a_.u32[i] & b_.u32[i]; - rz |= a_.u32[i] & b_.u32[i]; - } - - return - (rc >> ((sizeof(rc) * CHAR_BIT) - 1)) & - (rz >> ((sizeof(rz) * CHAR_BIT) - 1)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_testnzc_ps - #define _mm256_testnzc_ps(a, b) simde_mm256_testnzc_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm256_testnzc_pd (simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_testnzc_pd(a, b); - #else - uint64_t rc = 0, rz = 0; - simde__m256d_private - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { - rc |= ~a_.u64[i] & b_.u64[i]; - rz |= a_.u64[i] & b_.u64[i]; - } - - return - (rc >> ((sizeof(rc) * CHAR_BIT) - 1)) & - (rz >> ((sizeof(rz) * CHAR_BIT) - 1)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_testnzc_pd - #define _mm256_testnzc_pd(a, b) simde_mm256_testnzc_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm256_testnzc_si256 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_testnzc_si256(a, b); - #else - int32_t rc = 0, rz = 0; - simde__m256i_private - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) { - rc |= ~a_.i32f[i] & b_.i32f[i]; - rz |= a_.i32f[i] & b_.i32f[i]; - } - - return !!(rc & rz); - #endif 
-} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_testnzc_si256 - #define _mm256_testnzc_si256(a, b) simde_mm256_testnzc_si256(a, b) -#endif - -SIMDE_END_DECLS_ - -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX_H) */ diff --git a/extern/simde/x86/avx2.h b/extern/simde/x86/avx2.h deleted file mode 100644 index a8d3808b5..000000000 --- a/extern/simde/x86/avx2.h +++ /dev/null @@ -1,5758 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2018-2020 Evan Nemerson - * 2019-2020 Michael R. Crusoe - * 2020 Himanshi Mathur - * 2020 Hidayat Khan - */ - -#if !defined(SIMDE_X86_AVX2_H) -#define SIMDE_X86_AVX2_H - -#include "avx.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_abs_epi8 (simde__m256i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_abs_epi8(a); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_abs_epi8(a_.m128i[0]); - r_.m128i[1] = simde_mm_abs_epi8(a_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] < INT32_C(0)) ? -a_.i8[i] : a_.i8[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_abs_epi8 - #define _mm256_abs_epi8(a) simde_mm256_abs_epi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_abs_epi16 (simde__m256i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_abs_epi16(a); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_abs_epi16(a_.m128i[0]); - r_.m128i[1] = simde_mm_abs_epi16(a_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] < INT32_C(0)) ? 
-a_.i16[i] : a_.i16[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_abs_epi16 - #define _mm256_abs_epi16(a) simde_mm256_abs_epi16(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_abs_epi32(simde__m256i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_abs_epi32(a); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_abs_epi32(a_.m128i[0]); - r_.m128i[1] = simde_mm_abs_epi32(a_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) { - r_.i32[i] = (a_.i32[i] < INT32_C(0)) ? -a_.i32[i] : a_.i32[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_abs_epi32 - #define _mm256_abs_epi32(a) simde_mm256_abs_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_add_epi8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_add_epi8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_add_epi8(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_add_epi8(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = a_.i8 + b_.i8; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[i] + b_.i8[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_add_epi8 - #define _mm256_add_epi8(a, b) simde_mm256_add_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_add_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_add_epi16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_add_epi16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_add_epi16(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = a_.i16 + b_.i16; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[i] + b_.i16[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_add_epi16 - #define _mm256_add_epi16(a, b) simde_mm256_add_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_hadd_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_hadd_epi16(a, b); - #else - return simde_mm256_add_epi16(simde_x_mm256_deinterleaveeven_epi16(a, b), simde_x_mm256_deinterleaveodd_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_hadd_epi16 - #define _mm256_hadd_epi16(a, b) simde_mm256_hadd_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_add_epi32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_add_epi32(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_add_epi32(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = 
simde_mm_add_epi32(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = a_.i32 + b_.i32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] + b_.i32[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_add_epi32 - #define _mm256_add_epi32(a, b) simde_mm256_add_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_hadd_epi32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_hadd_epi32(a, b); - #else - return simde_mm256_add_epi32(simde_x_mm256_deinterleaveeven_epi32(a, b), simde_x_mm256_deinterleaveodd_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_hadd_epi32 - #define _mm256_hadd_epi32(a, b) simde_mm256_hadd_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_add_epi64 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_add_epi64(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_add_epi64(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_add_epi64(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_CLANG_BAD_VI64_OPS) - r_.i64 = a_.i64 + b_.i64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[i] + b_.i64[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_add_epi64 - #define _mm256_add_epi64(a, b) simde_mm256_add_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_alignr_epi8 (simde__m256i a, simde__m256i b, int count) - SIMDE_REQUIRE_CONSTANT_RANGE(count, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - if (HEDLEY_UNLIKELY(count > 31)) - return simde_mm256_setzero_si256(); - - for (size_t h = 0 ; h < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; h++) { - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m128i_private[h].i8) / sizeof(r_.m128i_private[h].i8[0])) ; i++) { - const int srcpos = count + HEDLEY_STATIC_CAST(int, i); - if (srcpos > 31) { - r_.m128i_private[h].i8[i] = 0; - } else if (srcpos > 15) { - r_.m128i_private[h].i8[i] = a_.m128i_private[h].i8[(srcpos) & 15]; - } else { - r_.m128i_private[h].i8[i] = b_.m128i_private[h].i8[srcpos]; - } - } - } - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) && !defined(SIMDE_BUG_PGI_30106) -# define simde_mm256_alignr_epi8(a, b, count) _mm256_alignr_epi8(a, b, count) -#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) -# define simde_mm256_alignr_epi8(a, b, count) \ - simde_mm256_set_m128i( \ - simde_mm_alignr_epi8(simde_mm256_extracti128_si256(a, 1), simde_mm256_extracti128_si256(b, 1), (count)), \ - simde_mm_alignr_epi8(simde_mm256_extracti128_si256(a, 0), simde_mm256_extracti128_si256(b, 0), (count))) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_alignr_epi8 - #define _mm256_alignr_epi8(a, b, count) simde_mm256_alignr_epi8(a, b, (count)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_and_si256 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_and_si256(a, b); - #else - 
simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_and_si128(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_and_si128(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f & b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[i] & b_.i64[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_and_si256 - #define _mm256_and_si256(a, b) simde_mm256_and_si256(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_andnot_si256 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_andnot_si256(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_andnot_si128(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_andnot_si128(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = ~(a_.i32f[i]) & b_.i32f[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_andnot_si256 - #define _mm256_andnot_si256(a, b) simde_mm256_andnot_si256(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_adds_epi8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_adds_epi8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_adds_epi8(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_adds_epi8(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = simde_math_adds_i8(a_.i8[i], b_.i8[i]); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_adds_epi8 - #define _mm256_adds_epi8(a, b) simde_mm256_adds_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_adds_epi16(simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_adds_epi16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_adds_epi16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_adds_epi16(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = simde_math_adds_i16(a_.i16[i], b_.i16[i]); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_adds_epi16 - #define _mm256_adds_epi16(a, b) simde_mm256_adds_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_hadds_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_hadds_epi16(a, b); - #else - return simde_mm256_adds_epi16(simde_x_mm256_deinterleaveeven_epi16(a, b), simde_x_mm256_deinterleaveodd_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_hadds_epi16 - #define 
_mm256_hadds_epi16(a, b) simde_mm256_hadds_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_adds_epu8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_adds_epu8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_adds_epu8(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_adds_epu8(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = simde_math_adds_u8(a_.u8[i], b_.u8[i]); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_adds_epu8 - #define _mm256_adds_epu8(a, b) simde_mm256_adds_epu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_adds_epu16(simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_adds_epu16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_adds_epu16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_adds_epu16(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = simde_math_adds_u16(a_.u16[i], b_.u16[i]); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_adds_epu16 - #define _mm256_adds_epu16(a, b) simde_mm256_adds_epu16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_avg_epu8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_avg_epu8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = (a_.u8[i] + b_.u8[i] + 1) >> 1; - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_avg_epu8 - #define _mm256_avg_epu8(a, b) simde_mm256_avg_epu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_avg_epu16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_avg_epu16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = (a_.u16[i] + b_.u16[i] + 1) >> 1; - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_avg_epu16 - #define _mm256_avg_epu16(a, b) simde_mm256_avg_epu16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_blend_epi32(simde__m128i a, simde__m128i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = ((imm8 >> i) & 1) ? 
b_.i32[i] : a_.i32[i]; - } - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm_blend_epi32(a, b, imm8) _mm_blend_epi32(a, b, imm8) -#elif SIMDE_NATURAL_FLOAT_VECTOR_SIZE_LE(128) -# define simde_mm_blend_epi32(a, b, imm8) \ - simde_mm_castps_si128(simde_mm_blend_ps(simde_mm_castsi128_ps(a), simde_mm_castsi128_ps(b), (imm8))) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_blend_epi32 - #define _mm_blend_epi32(a, b, imm8) simde_mm_blend_epi32(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_blend_epi16(simde__m256i a, simde__m256i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = ((imm8 >> i%8) & 1) ? b_.i16[i] : a_.i16[i]; - } - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_BUG_CLANG_REV_234560) -# define simde_mm256_blend_epi16(a, b, imm8) _mm256_castpd_si256(_mm256_blend_epi16(a, b, imm8)) -#elif defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_blend_epi16(a, b, imm8) _mm256_blend_epi16(a, b, imm8) -#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) -# define simde_mm256_blend_epi16(a, b, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_blend_epi16(simde_mm256_extracti128_si256(a, 1), simde_mm256_extracti128_si256(b, 1), (imm8)), \ - simde_mm_blend_epi16(simde_mm256_extracti128_si256(a, 0), simde_mm256_extracti128_si256(b, 0), (imm8))) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_blend_epi16 - #define _mm256_blend_epi16(a, b, imm8) simde_mm256_blend_epi16(a, b, imm8) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_blend_epi32(simde__m256i a, simde__m256i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = ((imm8 >> i) & 1) ? 
b_.i32[i] : a_.i32[i]; - } - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_blend_epi32(a, b, imm8) _mm256_blend_epi32(a, b, imm8) -#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) -# define simde_mm256_blend_epi32(a, b, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_blend_epi32(simde_mm256_extracti128_si256(a, 1), simde_mm256_extracti128_si256(b, 1), (imm8) >> 4), \ - simde_mm_blend_epi32(simde_mm256_extracti128_si256(a, 0), simde_mm256_extracti128_si256(b, 0), (imm8) & 0x0F)) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_blend_epi32 - #define _mm256_blend_epi32(a, b, imm8) simde_mm256_blend_epi32(a, b, imm8) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_blendv_epi8(simde__m256i a, simde__m256i b, simde__m256i mask) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_blendv_epi8(a, b, mask); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b), - mask_ = simde__m256i_to_private(mask); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_blendv_epi8(a_.m128i[0], b_.m128i[0], mask_.m128i[0]); - r_.m128i[1] = simde_mm_blendv_epi8(a_.m128i[1], b_.m128i[1], mask_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - __typeof__(mask_.i8) tmp = mask_.i8 >> 7; - r_.i8 = (tmp & b_.i8) | (~tmp & a_.i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - int8_t tmp = mask_.i8[i] >> 7; - r_.i8[i] = (tmp & b_.i8[i]) | (~tmp & a_.i8[i]); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_blendv_epi8(a, b, imm8) _mm256_blendv_epi8(a, b, imm8) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_blendv_epi8 - #define _mm256_blendv_epi8(a, b, mask) simde_mm256_blendv_epi8(a, b, mask) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_broadcastb_epi8 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm_broadcastb_epi8(a); - #else - simde__m128i_private r_; - simde__m128i_private a_= simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[0]; - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_broadcastb_epi8 - #define _mm_broadcastb_epi8(a) simde_mm_broadcastb_epi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_broadcastb_epi8 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_broadcastb_epi8(a); - #else - simde__m256i_private r_; - simde__m128i_private a_= simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[0]; - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_broadcastb_epi8 - #define _mm256_broadcastb_epi8(a) simde_mm256_broadcastb_epi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_broadcastw_epi16 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm_broadcastw_epi16(a); - #else - simde__m128i_private r_; - simde__m128i_private a_= simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[0]; - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - 
#undef _mm_broadcastw_epi16 - #define _mm_broadcastw_epi16(a) simde_mm_broadcastw_epi16(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_broadcastw_epi16 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_broadcastw_epi16(a); - #else - simde__m256i_private r_; - simde__m128i_private a_= simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[0]; - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_broadcastw_epi16 - #define _mm256_broadcastw_epi16(a) simde_mm256_broadcastw_epi16(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_broadcastd_epi32 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm_broadcastd_epi32(a); - #else - simde__m128i_private r_; - simde__m128i_private a_= simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[0]; - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_broadcastd_epi32 - #define _mm_broadcastd_epi32(a) simde_mm_broadcastd_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_broadcastd_epi32 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_broadcastd_epi32(a); - #else - simde__m256i_private r_; - simde__m128i_private a_= simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[0]; - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_broadcastd_epi32 - #define _mm256_broadcastd_epi32(a) simde_mm256_broadcastd_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_broadcastq_epi64 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm_broadcastq_epi64(a); - #else - simde__m128i_private r_; - simde__m128i_private a_= simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[0]; - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_broadcastq_epi64 - #define _mm_broadcastq_epi64(a) simde_mm_broadcastq_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_broadcastq_epi64 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_broadcastq_epi64(a); - #else - simde__m256i_private r_; - simde__m128i_private a_= simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[0]; - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_broadcastq_epi64 - #define _mm256_broadcastq_epi64(a) simde_mm256_broadcastq_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_broadcastss_ps (simde__m128 a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm_broadcastss_ps(a); - #elif defined(SIMDE_X86_SSE_NATIVE) - return simde_mm_shuffle_ps(a, a, 0); - #else - simde__m128_private r_; - simde__m128_private a_= simde__m128_to_private(a); - - #if defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 0, 0, 0, 0); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { 
- r_.f32[i] = a_.f32[0]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_broadcastss_ps - #define _mm_broadcastss_ps(a) simde_mm_broadcastss_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_broadcastss_ps (simde__m128 a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_broadcastss_ps(a); - #else - simde__m256_private r_; - simde__m128_private a_= simde__m128_to_private(a); - - #if defined(SIMDE_X86_AVX_NATIVE) - __m128 tmp = _mm_permute_ps(a_.n, 0); - r_.n = _mm256_insertf128_ps(_mm256_castps128_ps256(tmp), tmp, 1); - #elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) - r_.f32 = __builtin_shufflevector(a_.f32, a_.f32, 0, 0, 0, 0, 0, 0, 0, 0); - #elif SIMDE_NATURAL_FLOAT_VECTOR_SIZE_LE(128) - r_.m128[0] = r_.m128[1] = simde_mm_broadcastss_ps(simde__m128_from_private(a_)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[0]; - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_broadcastss_ps - #define _mm256_broadcastss_ps(a) simde_mm256_broadcastss_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_broadcastsd_pd (simde__m128d a) { - return simde_mm_movedup_pd(a); -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_broadcastsd_pd - #define _mm_broadcastsd_pd(a) simde_mm_broadcastsd_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_broadcastsd_pd (simde__m128d a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_broadcastsd_pd(a); - #else - simde__m256d_private r_; - simde__m128d_private a_= simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f64[0]; - } - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_broadcastsd_pd - #define _mm256_broadcastsd_pd(a) simde_mm256_broadcastsd_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_broadcastsi128_si256 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(4,8,0)) - return _mm256_broadcastsi128_si256(a); - #else - simde__m256i_private r_; - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i_private[0] = a_; - r_.m128i_private[1] = a_; - #else - r_.i64[0] = a_.i64[0]; - r_.i64[1] = a_.i64[1]; - r_.i64[2] = a_.i64[0]; - r_.i64[3] = a_.i64[1]; - #endif - - return simde__m256i_from_private(r_); - #endif -} -#define simde_mm_broadcastsi128_si256(a) simde_mm256_broadcastsi128_si256(a) -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_broadcastsi128_si256 - #define _mm256_broadcastsi128_si256(a) simde_mm256_broadcastsi128_si256(a) - #undef _mm_broadcastsi128_si256 - #define _mm_broadcastsi128_si256(a) simde_mm256_broadcastsi128_si256(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_bslli_epi128 (simde__m256i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - const int ssize = HEDLEY_STATIC_CAST(int, (sizeof(r_.i8) / sizeof(r_.i8[0]))); - - SIMDE_VECTORIZE - for (int i = 0 ; i < ssize ; i++) { - const int e = i - imm8; - if(i >= (ssize/2)) { - if(e >= (ssize/2) && e < ssize) - r_.i8[i] = a_.i8[e]; - else - r_.i8[i] = 0; - } - else{ - if(e 
>= 0 && e < (ssize/2)) - r_.i8[i] = a_.i8[e]; - else - r_.i8[i] = 0; - } - } - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(4,8,0)) && \ - SIMDE_DETECT_CLANG_VERSION_CHECK(3,7,0) - #define simde_mm256_bslli_epi128(a, imm8) _mm256_bslli_epi128(a, imm8) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_bslli_epi128 - #define _mm256_bslli_epi128(a, imm8) simde_mm256_bslli_epi128(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_bsrli_epi128 (simde__m256i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - const int ssize = HEDLEY_STATIC_CAST(int, (sizeof(r_.i8) / sizeof(r_.i8[0]))); - - SIMDE_VECTORIZE - for (int i = 0 ; i < ssize ; i++) { - const int e = i + imm8; - if(i < (ssize/2)) { - if(e >= 0 && e < (ssize/2)) - r_.i8[i] = a_.i8[e]; - else - r_.i8[i] = 0; - } - else{ - if(e >= (ssize/2) && e < ssize) - r_.i8[i] = a_.i8[e]; - else - r_.i8[i] = 0; - } - } - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(4,8,0)) && \ - SIMDE_DETECT_CLANG_VERSION_CHECK(3,7,0) - #define simde_mm256_bsrli_epi128(a, imm8) _mm256_bsrli_epi128(a, imm8) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_bsrli_epi128 - #define _mm256_bsrli_epi128(a, imm8) simde_mm256_bsrli_epi128(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cmpeq_epi8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cmpeq_epi8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_cmpeq_epi8(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_cmpeq_epi8(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] == b_.i8[i]) ? ~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmpeq_epi8 - #define _mm256_cmpeq_epi8(a, b) simde_mm256_cmpeq_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cmpeq_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cmpeq_epi16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_cmpeq_epi16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_cmpeq_epi16(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] == b_.i16[i]) ? 
~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmpeq_epi16 - #define _mm256_cmpeq_epi16(a, b) simde_mm256_cmpeq_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cmpeq_epi32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cmpeq_epi32(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_cmpeq_epi32(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_cmpeq_epi32(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmpeq_epi32 - #define _mm256_cmpeq_epi32(a, b) simde_mm256_cmpeq_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cmpeq_epi64 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cmpeq_epi64(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_cmpeq_epi64(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_cmpeq_epi64(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = (a_.i64[i] == b_.i64[i]) ? ~INT64_C(0) : INT64_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmpeq_epi64 - #define _mm256_cmpeq_epi64(a, b) simde_mm256_cmpeq_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cmpgt_epi8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cmpgt_epi8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_cmpgt_epi8(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_cmpgt_epi8(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), a_.i8 > b_.i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] > b_.i8[i]) ? ~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmpgt_epi8 - #define _mm256_cmpgt_epi8(a, b) simde_mm256_cmpgt_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cmpgt_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cmpgt_epi16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_cmpgt_epi16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_cmpgt_epi16(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = a_.i16 > b_.i16; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? 
~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmpgt_epi16 - #define _mm256_cmpgt_epi16(a, b) simde_mm256_cmpgt_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cmpgt_epi32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cmpgt_epi32(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_cmpgt_epi32(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_cmpgt_epi32(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), a_.i32 > b_.i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = (a_.i32[i] > b_.i32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmpgt_epi32 - #define _mm256_cmpgt_epi32(a, b) simde_mm256_cmpgt_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cmpgt_epi64 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cmpgt_epi64(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_cmpgt_epi64(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_cmpgt_epi64(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), a_.i64 > b_.i64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = (a_.i64[i] > b_.i64[i]) ? 
~INT64_C(0) : INT64_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmpgt_epi64 - #define _mm256_cmpgt_epi64(a, b) simde_mm256_cmpgt_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cvtepi8_epi16 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cvtepi8_epi16(a); - #else - simde__m256i_private r_; - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.i16, a_.i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i8[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtepi8_epi16 - #define _mm256_cvtepi8_epi16(a) simde_mm256_cvtepi8_epi16(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cvtepi8_epi32 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cvtepi8_epi32(a); - #else - simde__m256i_private r_; - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.i32, a_.m64_private[0].i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i8[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtepi8_epi32 - #define _mm256_cvtepi8_epi32(a) simde_mm256_cvtepi8_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cvtepi8_epi64 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cvtepi8_epi64(a); - #else - simde__m256i_private r_; - simde__m128i_private a_ = simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i8[i]; - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtepi8_epi64 - #define _mm256_cvtepi8_epi64(a) simde_mm256_cvtepi8_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cvtepi16_epi32 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cvtepi16_epi32(a); - #else - simde__m256i_private r_; - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.i32, a_.i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i16[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtepi16_epi32 - #define _mm256_cvtepi16_epi32(a) simde_mm256_cvtepi16_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cvtepi16_epi64 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cvtepi16_epi64(a); - #else - simde__m256i_private r_; - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.i64, a_.m64_private[0].i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i16[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtepi16_epi64 - #define _mm256_cvtepi16_epi64(a) simde_mm256_cvtepi16_epi64(a) -#endif - 
-SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cvtepi32_epi64 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cvtepi32_epi64(a); - #else - simde__m256i_private r_; - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.i64, a_.i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i32[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtepi32_epi64 - #define _mm256_cvtepi32_epi64(a) simde_mm256_cvtepi32_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cvtepu8_epi16 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cvtepu8_epi16(a); - #else - simde__m256i_private r_; - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.i16, a_.u8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.u8[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtepu8_epi16 - #define _mm256_cvtepu8_epi16(a) simde_mm256_cvtepu8_epi16(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cvtepu8_epi32 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cvtepu8_epi32(a); - #else - simde__m256i_private r_; - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.i32, a_.m64_private[0].u8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.u8[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtepu8_epi32 - #define _mm256_cvtepu8_epi32(a) simde_mm256_cvtepu8_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cvtepu8_epi64 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cvtepu8_epi64(a); - #else - simde__m256i_private r_; - simde__m128i_private a_ = simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.u8[i]; - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtepu8_epi64 - #define _mm256_cvtepu8_epi64(a) simde_mm256_cvtepu8_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cvtepu16_epi32 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cvtepu16_epi32(a); - #else - simde__m256i_private r_; - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.i32, a_.u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.u16[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtepu16_epi32 - #define _mm256_cvtepu16_epi32(a) simde_mm256_cvtepu16_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cvtepu16_epi64 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cvtepu16_epi64(a); - #else - simde__m256i_private r_; - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if 
defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.i64, a_.m64_private[0].u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.u16[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtepu16_epi64 - #define _mm256_cvtepu16_epi64(a) simde_mm256_cvtepu16_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cvtepu32_epi64 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cvtepu32_epi64(a); - #else - simde__m256i_private r_; - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.i64, a_.u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.u32[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtepu32_epi64 - #define _mm256_cvtepu32_epi64(a) simde_mm256_cvtepu32_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm256_extract_epi8 (simde__m256i a, const int index) - SIMDE_REQUIRE_RANGE(index, 0, 31){ - simde__m256i_private a_ = simde__m256i_to_private(a); - return a_.i8[index]; -} -#if defined(SIMDE_X86_AVX2_NATIVE) && \ - (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,10,0)) - #define simde_mm256_extract_epi8(a, index) _mm256_extract_epi8(a, index) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_extract_epi8 - #define _mm256_extract_epi8(a, index) simde_mm256_extract_epi8(a, index) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm256_extract_epi16 (simde__m256i a, const int index) - SIMDE_REQUIRE_RANGE(index, 0, 15) { - simde__m256i_private a_ = simde__m256i_to_private(a); - return a_.i16[index]; -} -#if defined(SIMDE_X86_AVX2_NATIVE) && \ - (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,10,0)) - #define simde_mm256_extract_epi16(a, index) _mm256_extract_epi16(a, index) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_extract_epi16 - #define _mm256_extract_epi16(a, index) simde_mm256_extract_epi16(a, index) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm256_extracti128_si256 (simde__m256i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) { - simde__m256i_private a_ = simde__m256i_to_private(a); - return a_.m128i[imm8]; -} -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_extracti128_si256(a, imm8) _mm256_extracti128_si256(a, imm8) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_extracti128_si256 - #define _mm256_extracti128_si256(a, imm8) simde_mm256_extracti128_si256(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_i32gather_epi32(const int32_t* base_addr, simde__m128i vindex, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex), - r_; - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - int32_t dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.i32[i] = dst; - } - - return 
simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm_i32gather_epi32(base_addr, vindex, scale) _mm_i32gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_i32gather_epi32 - #define _mm_i32gather_epi32(base_addr, vindex, scale) simde_mm_i32gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_i32gather_epi32(simde__m128i src, const int32_t* base_addr, simde__m128i vindex, simde__m128i mask, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex), - src_ = simde__m128i_to_private(src), - mask_ = simde__m128i_to_private(mask), - r_; - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) { - if ((mask_.i32[i] >> 31) & 1) { - const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - int32_t dst; - simde_memcpy(&dst, src1, sizeof(dst)); - r_.i32[i] = dst; - } - else { - r_.i32[i] = src_.i32[i]; - } - } - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm_mask_i32gather_epi32(src, base_addr, vindex, mask, scale) _mm_mask_i32gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, mask, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_i32gather_epi32 - #define _mm_mask_i32gather_epi32(src, base_addr, vindex, mask, scale) simde_mm_mask_i32gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, mask, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_i32gather_epi32(const int32_t* base_addr, simde__m256i vindex, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m256i_private - vindex_ = simde__m256i_to_private(vindex), - r_; - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - int32_t dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.i32[i] = dst; - } - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm256_i32gather_epi32(base_addr, vindex, scale) _mm256_i32gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_i32gather_epi32 - #define _mm256_i32gather_epi32(base_addr, vindex, scale) simde_mm256_i32gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_i32gather_epi32(simde__m256i src, const int32_t* base_addr, simde__m256i vindex, simde__m256i mask, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 
&& !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m256i_private - vindex_ = simde__m256i_to_private(vindex), - src_ = simde__m256i_to_private(src), - mask_ = simde__m256i_to_private(mask), - r_; - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) { - if ((mask_.i32[i] >> 31) & 1) { - const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - int32_t dst; - simde_memcpy(&dst, src1, sizeof(dst)); - r_.i32[i] = dst; - } - else { - r_.i32[i] = src_.i32[i]; - } - } - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm256_mask_i32gather_epi32(src, base_addr, vindex, mask, scale) _mm256_mask_i32gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, mask, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_i32gather_epi32 - #define _mm256_mask_i32gather_epi32(src, base_addr, vindex, mask, scale) simde_mm256_mask_i32gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, mask, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_i64gather_epi32(const int32_t* base_addr, simde__m128i vindex, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex), - r_ = simde__m128i_to_private(simde_mm_setzero_si128()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - int32_t dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.i32[i] = dst; - } - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm_i64gather_epi32(base_addr, vindex, scale) _mm_i64gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_i64gather_epi32 - #define _mm_i64gather_epi32(base_addr, vindex, scale) simde_mm_i64gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_i64gather_epi32(simde__m128i src, const int32_t* base_addr, simde__m128i vindex, simde__m128i mask, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex), - src_ = simde__m128i_to_private(src), - mask_ = simde__m128i_to_private(mask), - r_ = simde__m128i_to_private(simde_mm_setzero_si128()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { - if ((mask_.i32[i] >> 31) & 1) { - const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - int32_t dst; - simde_memcpy(&dst, src1, sizeof(dst)); - r_.i32[i] = dst; - } - else { 
- r_.i32[i] = src_.i32[i]; - } - } - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm_mask_i64gather_epi32(src, base_addr, vindex, mask, scale) _mm_mask_i64gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, mask, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_i64gather_epi32 - #define _mm_mask_i64gather_epi32(src, base_addr, vindex, mask, scale) simde_mm_mask_i64gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, mask, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm256_i64gather_epi32(const int32_t* base_addr, simde__m256i vindex, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m256i_private - vindex_ = simde__m256i_to_private(vindex); - simde__m128i_private - r_ = simde__m128i_to_private(simde_mm_setzero_si128()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - int32_t dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.i32[i] = dst; - } - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm256_i64gather_epi32(base_addr, vindex, scale) _mm256_i64gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_i64gather_epi32 - #define _mm256_i64gather_epi32(base_addr, vindex, scale) simde_mm256_i64gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm256_mask_i64gather_epi32(simde__m128i src, const int32_t* base_addr, simde__m256i vindex, simde__m128i mask, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m256i_private - vindex_ = simde__m256i_to_private(vindex); - simde__m128i_private - src_ = simde__m128i_to_private(src), - mask_ = simde__m128i_to_private(mask), - r_ = simde__m128i_to_private(simde_mm_setzero_si128()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { - if ((mask_.i32[i] >> 31) & 1) { - const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - int32_t dst; - simde_memcpy(&dst, src1, sizeof(dst)); - r_.i32[i] = dst; - } - else { - r_.i32[i] = src_.i32[i]; - } - } - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm256_mask_i64gather_epi32(src, base_addr, vindex, mask, scale) _mm256_mask_i64gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, mask, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_i64gather_epi32 - #define _mm256_mask_i64gather_epi32(src, base_addr, vindex, mask, scale) simde_mm256_mask_i64gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), 
vindex, mask, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_i32gather_epi64(const int64_t* base_addr, simde__m128i vindex, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex), - r_; - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - int64_t dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.i64[i] = dst; - } - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - #define simde_mm_i32gather_epi64(base_addr, vindex, scale) _mm_i32gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale) - #else - #define simde_mm_i32gather_epi64(base_addr, vindex, scale) _mm_i32gather_epi64(HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, scale) - #endif -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_i32gather_epi64 - #define _mm_i32gather_epi64(base_addr, vindex, scale) simde_mm_i32gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_i32gather_epi64(simde__m128i src, const int64_t* base_addr, simde__m128i vindex, simde__m128i mask, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex), - src_ = simde__m128i_to_private(src), - mask_ = simde__m128i_to_private(mask), - r_; - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - if ((mask_.i64[i] >> 63) & 1) { - const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - int64_t dst; - simde_memcpy(&dst, src1, sizeof(dst)); - r_.i64[i] = dst; - } - else { - r_.i64[i] = src_.i64[i]; - } - } - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - #define simde_mm_mask_i32gather_epi64(src, base_addr, vindex, mask, scale) _mm_mask_i32gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale) - #else - #define simde_mm_mask_i32gather_epi64(src, base_addr, vindex, mask, scale) _mm_mask_i32gather_epi64(src, HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, mask, scale) - #endif -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_i32gather_epi64 - #define _mm_mask_i32gather_epi64(src, base_addr, vindex, mask, scale) simde_mm_mask_i32gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_i32gather_epi64(const int64_t* base_addr, simde__m128i vindex, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex); - 
simde__m256i_private - r_; - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - int64_t dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.i64[i] = dst; - } - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - #define simde_mm256_i32gather_epi64(base_addr, vindex, scale) _mm256_i32gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale) - #else - #define simde_mm256_i32gather_epi64(base_addr, vindex, scale) _mm256_i32gather_epi64(HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, scale) - #endif -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_i32gather_epi64 - #define _mm256_i32gather_epi64(base_addr, vindex, scale) simde_mm256_i32gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_i32gather_epi64(simde__m256i src, const int64_t* base_addr, simde__m128i vindex, simde__m256i mask, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m256i_private - src_ = simde__m256i_to_private(src), - mask_ = simde__m256i_to_private(mask), - r_; - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) { - if ((mask_.i64[i] >> 63) & 1) { - const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - int64_t dst; - simde_memcpy(&dst, src1, sizeof(dst)); - r_.i64[i] = dst; - } - else { - r_.i64[i] = src_.i64[i]; - } - } - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - #define simde_mm256_mask_i32gather_epi64(src, base_addr, vindex, mask, scale) _mm256_mask_i32gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale) - #else - #define simde_mm256_mask_i32gather_epi64(src, base_addr, vindex, mask, scale) _mm256_mask_i32gather_epi64(src, HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, mask, scale) - #endif -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_i32gather_epi64 - #define _mm256_mask_i32gather_epi64(src, base_addr, vindex, mask, scale) simde_mm256_mask_i32gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_i64gather_epi64(const int64_t* base_addr, simde__m128i vindex, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex), - r_ = simde__m128i_to_private(simde_mm_setzero_si128()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , 
vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - int64_t dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.i64[i] = dst; - } - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - #define simde_mm_i64gather_epi64(base_addr, vindex, scale) _mm_i64gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale) - #else - #define simde_mm_i64gather_epi64(base_addr, vindex, scale) _mm_i64gather_epi64(HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, scale) - #endif -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_i64gather_epi64 - #define _mm_i64gather_epi64(base_addr, vindex, scale) simde_mm_i64gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_i64gather_epi64(simde__m128i src, const int64_t* base_addr, simde__m128i vindex, simde__m128i mask, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex), - src_ = simde__m128i_to_private(src), - mask_ = simde__m128i_to_private(mask), - r_ = simde__m128i_to_private(simde_mm_setzero_si128()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { - if ((mask_.i64[i] >> 63) & 1) { - const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - int64_t dst; - simde_memcpy(&dst, src1, sizeof(dst)); - r_.i64[i] = dst; - } - else { - r_.i64[i] = src_.i64[i]; - } - } - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - #define simde_mm_mask_i64gather_epi64(src, base_addr, vindex, mask, scale) _mm_mask_i64gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale) - #else - #define simde_mm_mask_i64gather_epi64(src, base_addr, vindex, mask, scale) _mm_mask_i64gather_epi64(src, HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, mask, scale) - #endif -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_i64gather_epi64 - #define _mm_mask_i64gather_epi64(src, base_addr, vindex, mask, scale) simde_mm_mask_i64gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_i64gather_epi64(const int64_t* base_addr, simde__m256i vindex, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m256i_private - vindex_ = simde__m256i_to_private(vindex), - r_ = simde__m256i_to_private(simde_mm256_setzero_si256()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - int64_t dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.i64[i] = dst; - } - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - #define 
simde_mm256_i64gather_epi64(base_addr, vindex, scale) _mm256_i64gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale) - #else - #define simde_mm256_i64gather_epi64(base_addr, vindex, scale) _mm256_i64gather_epi64(HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, scale) - #endif -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_i64gather_epi64 - #define _mm256_i64gather_epi64(base_addr, vindex, scale) simde_mm256_i64gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_i64gather_epi64(simde__m256i src, const int64_t* base_addr, simde__m256i vindex, simde__m256i mask, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m256i_private - vindex_ = simde__m256i_to_private(vindex), - src_ = simde__m256i_to_private(src), - mask_ = simde__m256i_to_private(mask), - r_ = simde__m256i_to_private(simde_mm256_setzero_si256()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { - if ((mask_.i64[i] >> 63) & 1) { - const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - int64_t dst; - simde_memcpy(&dst, src1, sizeof(dst)); - r_.i64[i] = dst; - } - else { - r_.i64[i] = src_.i64[i]; - } - } - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - #define simde_mm256_mask_i64gather_epi64(src, base_addr, vindex, mask, scale) _mm256_mask_i64gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale) - #else - #define simde_mm256_mask_i64gather_epi64(src, base_addr, vindex, mask, scale) _mm256_mask_i64gather_epi64(src, HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, mask, scale) - #endif -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_i64gather_epi64 - #define _mm256_mask_i64gather_epi64(src, base_addr, vindex, mask, scale) simde_mm256_mask_i64gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_i32gather_ps(const simde_float32* base_addr, simde__m128i vindex, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex); - simde__m128_private - r_; - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - simde_float32 dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.f32[i] = dst; - } - - return simde__m128_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm_i32gather_ps(base_addr, vindex, scale) _mm_i32gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_i32gather_ps - #define _mm_i32gather_ps(base_addr, vindex, scale) 
simde_mm_i32gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_mask_i32gather_ps(simde__m128 src, const simde_float32* base_addr, simde__m128i vindex, simde__m128 mask, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex); - simde__m128_private - src_ = simde__m128_to_private(src), - mask_ = simde__m128_to_private(mask), - r_; - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) { - if ((mask_.i32[i] >> 31) & 1) { - const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - simde_float32 dst; - simde_memcpy(&dst, src1, sizeof(dst)); - r_.f32[i] = dst; - } - else { - r_.f32[i] = src_.f32[i]; - } - } - - return simde__m128_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm_mask_i32gather_ps(src, base_addr, vindex, mask, scale) _mm_mask_i32gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, mask, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_i32gather_ps - #define _mm_mask_i32gather_ps(src, base_addr, vindex, mask, scale) simde_mm_mask_i32gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, mask, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_i32gather_ps(const simde_float32* base_addr, simde__m256i vindex, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m256i_private - vindex_ = simde__m256i_to_private(vindex); - simde__m256_private - r_; - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - simde_float32 dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.f32[i] = dst; - } - - return simde__m256_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm256_i32gather_ps(base_addr, vindex, scale) _mm256_i32gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, (base_addr)), (vindex), (scale)) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_i32gather_ps - #define _mm256_i32gather_ps(base_addr, vindex, scale) simde_mm256_i32gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, (base_addr)), (vindex), (scale)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_mask_i32gather_ps(simde__m256 src, const simde_float32* base_addr, simde__m256i vindex, simde__m256 mask, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m256i_private - vindex_ = simde__m256i_to_private(vindex); - simde__m256_private - src_ = simde__m256_to_private(src), - mask_ = simde__m256_to_private(mask), - r_; - const uint8_t* addr = 
HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) { - if ((mask_.i32[i] >> 31) & 1) { - const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - simde_float32 dst; - simde_memcpy(&dst, src1, sizeof(dst)); - r_.f32[i] = dst; - } - else { - r_.f32[i] = src_.f32[i]; - } - } - - return simde__m256_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale) _mm256_mask_i32gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, mask, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_i32gather_ps - #define _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale) simde_mm256_mask_i32gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, mask, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_i64gather_ps(const simde_float32* base_addr, simde__m128i vindex, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex); - simde__m128_private - r_ = simde__m128_to_private(simde_mm_setzero_ps()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - simde_float32 dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.f32[i] = dst; - } - - return simde__m128_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm_i64gather_ps(base_addr, vindex, scale) _mm_i64gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_i64gather_ps - #define _mm_i64gather_ps(base_addr, vindex, scale) simde_mm_i64gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_mask_i64gather_ps(simde__m128 src, const simde_float32* base_addr, simde__m128i vindex, simde__m128 mask, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex); - simde__m128_private - src_ = simde__m128_to_private(src), - mask_ = simde__m128_to_private(mask), - r_ = simde__m128_to_private(simde_mm_setzero_ps()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { - if ((mask_.i32[i] >> 31) & 1) { - const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - simde_float32 dst; - simde_memcpy(&dst, src1, sizeof(dst)); - r_.f32[i] = dst; - } - else { - r_.f32[i] = src_.f32[i]; - } - } - - return simde__m128_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm_mask_i64gather_ps(src, base_addr, vindex, mask, scale) 
_mm_mask_i64gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(float const*, float32_t const*, base_addr), vindex, mask, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_i64gather_ps - #define _mm_mask_i64gather_ps(src, base_addr, vindex, mask, scale) simde_mm_mask_i64gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, mask, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm256_i64gather_ps(const simde_float32* base_addr, simde__m256i vindex, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m256i_private - vindex_ = simde__m256i_to_private(vindex); - simde__m128_private - r_ = simde__m128_to_private(simde_mm_setzero_ps()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - simde_float32 dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.f32[i] = dst; - } - - return simde__m128_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm256_i64gather_ps(base_addr, vindex, scale) _mm256_i64gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_i64gather_ps - #define _mm256_i64gather_ps(base_addr, vindex, scale) simde_mm256_i64gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm256_mask_i64gather_ps(simde__m128 src, const simde_float32* base_addr, simde__m256i vindex, simde__m128 mask, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m256i_private - vindex_ = simde__m256i_to_private(vindex); - simde__m128_private - src_ = simde__m128_to_private(src), - mask_ = simde__m128_to_private(mask), - r_ = simde__m128_to_private(simde_mm_setzero_ps()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { - if ((mask_.i32[i] >> 31) & 1) { - const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - simde_float32 dst; - simde_memcpy(&dst, src1, sizeof(dst)); - r_.f32[i] = dst; - } - else { - r_.f32[i] = src_.f32[i]; - } - } - - return simde__m128_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm256_mask_i64gather_ps(src, base_addr, vindex, mask, scale) _mm256_mask_i64gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, mask, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_i64gather_ps - #define _mm256_mask_i64gather_ps(src, base_addr, vindex, mask, scale) simde_mm256_mask_i64gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, mask, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_i32gather_pd(const simde_float64* base_addr, simde__m128i vindex, const int32_t scale) - 
SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex); - simde__m128d_private - r_; - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - simde_float64 dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.f64[i] = dst; - } - - return simde__m128d_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm_i32gather_pd(base_addr, vindex, scale) _mm_i32gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_i32gather_pd - #define _mm_i32gather_pd(base_addr, vindex, scale) simde_mm_i32gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_mask_i32gather_pd(simde__m128d src, const simde_float64* base_addr, simde__m128i vindex, simde__m128d mask, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex); - simde__m128d_private - src_ = simde__m128d_to_private(src), - mask_ = simde__m128d_to_private(mask), - r_; - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - if ((mask_.i64[i] >> 63) & 1) { - const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - simde_float64 dst; - simde_memcpy(&dst, src1, sizeof(dst)); - r_.f64[i] = dst; - } - else { - r_.f64[i] = src_.f64[i]; - } - } - - return simde__m128d_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm_mask_i32gather_pd(src, base_addr, vindex, mask, scale) _mm_mask_i32gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_i32gather_pd - #define _mm_mask_i32gather_pd(src, base_addr, vindex, mask, scale) simde_mm_mask_i32gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_i32gather_pd(const simde_float64* base_addr, simde__m128i vindex, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex); - simde__m256d_private - r_; - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - simde_float64 dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.f64[i] = dst; - } - - return simde__m256d_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm256_i32gather_pd(base_addr, vindex, scale) 
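For reference, every gather emulation removed here follows the same scalar pattern: reinterpret base_addr as a byte pointer, offset it by index * scale for each lane, and memcpy the element out so unaligned or type-punned sources stay well defined. A minimal standalone sketch of that pattern (the helper name gather_pd_scalar is illustrative, not simde API), assuming a 4-lane double gather with 32-bit indices:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Scalar sketch: dst[i] = *(double *)((char *)base + idx[i] * scale) */
    static void gather_pd_scalar(double dst[4], const void *base,
                                 const int32_t idx[4], int scale)
    {
        const uint8_t *bytes = (const uint8_t *) base;
        for (size_t i = 0; i < 4; i++) {
            double v;
            memcpy(&v, bytes + (size_t) idx[i] * (size_t) scale, sizeof v);
            dst[i] = v;
        }
    }

    int main(void)
    {
        double data[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
        int32_t idx[4] = { 7, 5, 3, 1 };
        double out[4];
        gather_pd_scalar(out, data, idx, sizeof(double)); /* scale = 8 bytes */
        printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 7 5 3 1 */
        return 0;
    }

The masked variants add one step: a lane is gathered only when the sign bit of the corresponding mask lane is set, otherwise the lane is copied from src.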
_mm256_i32gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_i32gather_pd - #define _mm256_i32gather_pd(base_addr, vindex, scale) simde_mm256_i32gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_mask_i32gather_pd(simde__m256d src, const simde_float64* base_addr, simde__m128i vindex, simde__m256d mask, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m256d_private - src_ = simde__m256d_to_private(src), - mask_ = simde__m256d_to_private(mask), - r_; - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) { - if ((mask_.i64[i] >> 63) & 1) { - const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - simde_float64 dst; - simde_memcpy(&dst, src1, sizeof(dst)); - r_.f64[i] = dst; - } - else { - r_.f64[i] = src_.f64[i]; - } - } - - return simde__m256d_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm256_mask_i32gather_pd(src, base_addr, vindex, mask, scale) _mm256_mask_i32gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_i32gather_pd - #define _mm256_mask_i32gather_pd(src, base_addr, vindex, mask, scale) simde_mm256_mask_i32gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_i64gather_pd(const simde_float64* base_addr, simde__m128i vindex, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex); - simde__m128d_private - r_ = simde__m128d_to_private(simde_mm_setzero_pd()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - simde_float64 dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.f64[i] = dst; - } - - return simde__m128d_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm_i64gather_pd(base_addr, vindex, scale) _mm_i64gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_i64gather_pd - #define _mm_i64gather_pd(base_addr, vindex, scale) simde_mm_i64gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_mask_i64gather_pd(simde__m128d src, const simde_float64* base_addr, simde__m128i vindex, simde__m128d mask, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m128i_private - vindex_ = 
simde__m128i_to_private(vindex); - simde__m128d_private - src_ = simde__m128d_to_private(src), - mask_ = simde__m128d_to_private(mask), - r_ = simde__m128d_to_private(simde_mm_setzero_pd()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { - if ((mask_.i64[i] >> 63) & 1) { - const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - simde_float64 dst; - simde_memcpy(&dst, src1, sizeof(dst)); - r_.f64[i] = dst; - } - else { - r_.f64[i] = src_.f64[i]; - } - } - - return simde__m128d_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm_mask_i64gather_pd(src, base_addr, vindex, mask, scale) _mm_mask_i64gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_i64gather_pd - #define _mm_mask_i64gather_pd(src, base_addr, vindex, mask, scale) simde_mm_mask_i64gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_i64gather_pd(const simde_float64* base_addr, simde__m256i vindex, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m256i_private - vindex_ = simde__m256i_to_private(vindex); - simde__m256d_private - r_ = simde__m256d_to_private(simde_mm256_setzero_pd()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - simde_float64 dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.f64[i] = dst; - } - - return simde__m256d_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm256_i64gather_pd(base_addr, vindex, scale) _mm256_i64gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_i64gather_pd - #define _mm256_i64gather_pd(base_addr, vindex, scale) simde_mm256_i64gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_mask_i64gather_pd(simde__m256d src, const simde_float64* base_addr, simde__m256i vindex, simde__m256d mask, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m256i_private - vindex_ = simde__m256i_to_private(vindex); - simde__m256d_private - src_ = simde__m256d_to_private(src), - mask_ = simde__m256d_to_private(mask), - r_ = simde__m256d_to_private(simde_mm256_setzero_pd()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { - if ((mask_.i64[i] >> 63) & 1) { - const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - simde_float64 dst; - simde_memcpy(&dst, src1, sizeof(dst)); - r_.f64[i] = dst; - } - else { - r_.f64[i] = src_.f64[i]; - } - } - - return 
simde__m256d_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm256_mask_i64gather_pd(src, base_addr, vindex, mask, scale) _mm256_mask_i64gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_i64gather_pd - #define _mm256_mask_i64gather_pd(src, base_addr, vindex, mask, scale) simde_mm256_mask_i64gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_inserti128_si256(simde__m256i a, simde__m128i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) { - simde__m256i_private a_ = simde__m256i_to_private(a); - simde__m128i_private b_ = simde__m128i_to_private(b); - - a_.m128i_private[ imm8 & 1 ] = b_; - - return simde__m256i_from_private(a_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm256_inserti128_si256(a, b, imm8) _mm256_inserti128_si256(a, b, imm8) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_inserti128_si256 - #define _mm256_inserti128_si256(a, b, imm8) simde_mm256_inserti128_si256(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_madd_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_madd_epi16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_madd_epi16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_madd_epi16(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) - SIMDE_ALIGN_TO_32 int32_t product SIMDE_VECTOR(64); - SIMDE_ALIGN_TO_32 int32_t a32x16 SIMDE_VECTOR(64); - SIMDE_ALIGN_TO_32 int32_t b32x16 SIMDE_VECTOR(64); - SIMDE_ALIGN_TO_32 int32_t even SIMDE_VECTOR(32); - SIMDE_ALIGN_TO_32 int32_t odd SIMDE_VECTOR(32); - - SIMDE_CONVERT_VECTOR_(a32x16, a_.i16); - SIMDE_CONVERT_VECTOR_(b32x16, b_.i16); - product = a32x16 * b32x16; - - even = __builtin_shufflevector(product, product, 0, 2, 4, 6, 8, 10, 12, 14); - odd = __builtin_shufflevector(product, product, 1, 3, 5, 7, 9, 11, 13, 15); - - r_.i32 = even + odd; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i += 2) { - r_.i32[i / 2] = (a_.i16[i] * b_.i16[i]) + (a_.i16[i + 1] * b_.i16[i + 1]); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_madd_epi16 - #define _mm256_madd_epi16(a, b) simde_mm256_madd_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maddubs_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_maddubs_epi16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_maddubs_epi16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_maddubs_epi16(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - const int idx = HEDLEY_STATIC_CAST(int, i) << 1; - int32_t ts = - (HEDLEY_STATIC_CAST(int16_t, a_.u8[ idx ]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[ idx ])) + - (HEDLEY_STATIC_CAST(int16_t, a_.u8[idx + 1]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[idx + 1])); 
- r_.i16[i] = (ts > INT16_MIN) ? ((ts < INT16_MAX) ? HEDLEY_STATIC_CAST(int16_t, ts) : INT16_MAX) : INT16_MIN; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_maddubs_epi16 - #define _mm256_maddubs_epi16(a, b) simde_mm256_maddubs_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskload_epi32 (const int32_t mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m128i mask) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm_maskload_epi32(mem_addr, mask); - #else - simde__m128i_private - r_, - mask_ = simde__m128i_to_private(mask), - mask_shr_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - mask_shr_.neon_i32 = vshrq_n_s32(mask_.neon_i32, 31); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - mask_shr_.i32[i] = mask_.i32[i] >> 31; - } - #endif - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = mask_shr_.i32[i] ? mem_addr[i] : INT32_C(0); - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_maskload_epi32 - #define _mm_maskload_epi32(mem_addr, mask) simde_mm_maskload_epi32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr), mask) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskload_epi32 (const int32_t mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m256i mask) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_maskload_epi32(mem_addr, mask); - #else - simde__m256i_private - mask_ = simde__m256i_to_private(mask), - r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = (mask_.i32[i] >> 31) ? mem_addr[i] : INT32_C(0); - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskload_epi32 - #define _mm256_maskload_epi32(mem_addr, mask) simde_mm256_maskload_epi32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr), mask) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskload_epi64 (const int64_t mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128i mask) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm_maskload_epi64(HEDLEY_REINTERPRET_CAST(const long long *, mem_addr), mask); - #else - simde__m128i_private - r_, - mask_ = simde__m128i_to_private(mask), - mask_shr_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - mask_shr_.neon_i64 = vshrq_n_s64(mask_.neon_i64, 63); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(mask_.i64) / sizeof(mask_.i64[0])) ; i++) { - mask_shr_.i64[i] = mask_.i64[i] >> 63; - } - #endif - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = mask_shr_.i64[i] ? 
mem_addr[i] : INT64_C(0); - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_maskload_epi64 - #define _mm_maskload_epi64(mem_addr, mask) simde_mm_maskload_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, mem_addr), mask) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskload_epi64 (const int64_t mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m256i mask) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_maskload_epi64(HEDLEY_REINTERPRET_CAST(const long long *, mem_addr), mask); - #else - simde__m256i_private - mask_ = simde__m256i_to_private(mask), - r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = (mask_.i64[i] >> 63) ? mem_addr[i] : INT64_C(0); - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskload_epi64 - #define _mm256_maskload_epi64(mem_addr, mask) simde_mm256_maskload_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, mem_addr), mask) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_maskstore_epi32 (int32_t mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m128i mask, simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - _mm_maskstore_epi32(mem_addr, mask, a); - #else - simde__m128i_private mask_ = simde__m128i_to_private(mask); - simde__m128i_private a_ = simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - if (mask_.u32[i] & (UINT32_C(1) << 31)) - mem_addr[i] = a_.i32[i]; - } - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_maskstore_epi32 - #define _mm_maskstore_epi32(mem_addr, mask, a) simde_mm_maskstore_epi32(HEDLEY_REINTERPRET_CAST(int32_t *, mem_addr), mask, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_maskstore_epi32 (int32_t mem_addr[HEDLEY_ARRAY_PARAM(8)], simde__m256i mask, simde__m256i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - _mm256_maskstore_epi32(mem_addr, mask, a); - #else - simde__m256i_private mask_ = simde__m256i_to_private(mask); - simde__m256i_private a_ = simde__m256i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - if (mask_.u32[i] & (UINT32_C(1) << 31)) - mem_addr[i] = a_.i32[i]; - } - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskstore_epi32 - #define _mm256_maskstore_epi32(mem_addr, mask, a) simde_mm256_maskstore_epi32(HEDLEY_REINTERPRET_CAST(int32_t *, mem_addr), mask, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_maskstore_epi64 (int64_t mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128i mask, simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - _mm_maskstore_epi64(HEDLEY_REINTERPRET_CAST(long long *, mem_addr), mask, a); - #else - simde__m128i_private mask_ = simde__m128i_to_private(mask); - simde__m128i_private a_ = simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { - if (mask_.u64[i] >> 63) - mem_addr[i] = a_.i64[i]; - } - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_maskstore_epi64 - #define _mm_maskstore_epi64(mem_addr, mask, a) simde_mm_maskstore_epi64(HEDLEY_REINTERPRET_CAST(int64_t *, mem_addr), mask, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_maskstore_epi64 (int64_t mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m256i mask, simde__m256i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - 
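The maskload/maskstore fallbacks removed here all key off the top (sign) bit of each mask lane: set means the element is loaded or stored, clear means the lane is zeroed (loads) or the memory is left untouched (stores). A small scalar sketch under that assumption (helper name ours):

    #include <stdint.h>
    #include <stdio.h>

    /* Scalar masked 32-bit load: lane i is read only when mask[i] has its
       sign bit set; otherwise the lane becomes 0. */
    static void maskload_epi32_scalar(int32_t dst[4], const int32_t *mem,
                                      const int32_t mask[4])
    {
        for (size_t i = 0; i < 4; i++) {
            dst[i] = (mask[i] < 0) ? mem[i] : 0; /* mask[i] < 0 <=> top bit set */
        }
    }

    int main(void)
    {
        int32_t mem[4]  = { 10, 20, 30, 40 };
        int32_t mask[4] = { -1, 0, INT32_MIN, 1 };
        int32_t out[4];
        maskload_epi32_scalar(out, mem, mask);
        printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); /* 10 0 30 0 */
        return 0;
    }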
_mm256_maskstore_epi64(HEDLEY_REINTERPRET_CAST(long long *, mem_addr), mask, a); - #else - simde__m256i_private mask_ = simde__m256i_to_private(mask); - simde__m256i_private a_ = simde__m256i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { - if (mask_.u64[i] & (UINT64_C(1) << 63)) - mem_addr[i] = a_.i64[i]; - } - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskstore_epi64 - #define _mm256_maskstore_epi64(mem_addr, mask, a) simde_mm256_maskstore_epi64(HEDLEY_REINTERPRET_CAST(int64_t *, mem_addr), mask, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_max_epi8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) && !defined(__PGI) - return _mm256_max_epi8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_max_epi8(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_max_epi8(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[i] > b_.i8[i] ? a_.i8[i] : b_.i8[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_max_epi8 - #define _mm256_max_epi8(a, b) simde_mm256_max_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_max_epu8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_max_epu8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_max_epu8(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_max_epu8(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = (a_.u8[i] > b_.u8[i]) ? a_.u8[i] : b_.u8[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_max_epu8 - #define _mm256_max_epu8(a, b) simde_mm256_max_epu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_max_epu16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_max_epu16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_max_epu16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_max_epu16(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = (a_.u16[i] > b_.u16[i]) ? 
a_.u16[i] : b_.u16[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_max_epu16 - #define _mm256_max_epu16(a, b) simde_mm256_max_epu16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_max_epu32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_max_epu32(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_max_epu32(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_max_epu32(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (a_.u32[i] > b_.u32[i]) ? a_.u32[i] : b_.u32[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_max_epu32 - #define _mm256_max_epu32(a, b) simde_mm256_max_epu32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_max_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_max_epi16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_max_epi16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_max_epi16(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? a_.i16[i] : b_.i16[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_max_epi16 - #define _mm256_max_epi16(a, b) simde_mm256_max_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_max_epi32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_max_epi32(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_max_epi32(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_max_epi32(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] > b_.i32[i] ? a_.i32[i] : b_.i32[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_max_epi32 - #define _mm256_max_epi32(a, b) simde_mm256_max_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_min_epi8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) && !defined(__PGI) - return _mm256_min_epi8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_min_epi8(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_min_epi8(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[i] < b_.i8[i] ? 
a_.i8[i] : b_.i8[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_min_epi8 - #define _mm256_min_epi8(a, b) simde_mm256_min_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_min_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_min_epi16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_min_epi16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_min_epi16(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? a_.i16[i] : b_.i16[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_min_epi16 - #define _mm256_min_epi16(a, b) simde_mm256_min_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_min_epi32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_min_epi32(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_min_epi32(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_min_epi32(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] < b_.i32[i] ? a_.i32[i] : b_.i32[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_min_epi32 - #define _mm256_min_epi32(a, b) simde_mm256_min_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_min_epu8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_min_epu8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_min_epu8(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_min_epu8(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = (a_.u8[i] < b_.u8[i]) ? a_.u8[i] : b_.u8[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_min_epu8 - #define _mm256_min_epu8(a, b) simde_mm256_min_epu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_min_epu16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_min_epu16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_min_epu16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_min_epu16(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = (a_.u16[i] < b_.u16[i]) ? 
a_.u16[i] : b_.u16[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_min_epu16 - #define _mm256_min_epu16(a, b) simde_mm256_min_epu16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_min_epu32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_min_epu32(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_min_epu32(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_min_epu32(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (a_.u32[i] < b_.u32[i]) ? a_.u32[i] : b_.u32[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_min_epu32 - #define _mm256_min_epu32(a, b) simde_mm256_min_epu32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int32_t -simde_mm256_movemask_epi8 (simde__m256i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_movemask_epi8(a); - #else - simde__m256i_private a_ = simde__m256i_to_private(a); - uint32_t r = 0; - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(a_.m128i) / sizeof(a_.m128i[0])) ; i++) { - r |= HEDLEY_STATIC_CAST(uint32_t,simde_mm_movemask_epi8(a_.m128i[i])) << (16 * i); - } - #else - r = 0; - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) { - r |= HEDLEY_STATIC_CAST(uint32_t, (a_.u8[31 - i] >> 7)) << (31 - i); - } - #endif - - return HEDLEY_STATIC_CAST(int32_t, r); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_movemask_epi8 - #define _mm256_movemask_epi8(a) simde_mm256_movemask_epi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mpsadbw_epu8 (simde__m256i a, simde__m256i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - const int a_offset1 = imm8 & 4; - const int b_offset1 = (imm8 & 3) << 2; - const int a_offset2 = (imm8 >> 3) & 4; - const int b_offset2 = ((imm8 >> 3) & 3) << 2; - - #if defined(simde_math_abs) - const int halfway_point = HEDLEY_STATIC_CAST(int, (sizeof(r_.u16) / sizeof(r_.u16[0])) ) / 2; - for (int i = 0 ; i < halfway_point ; i++) { - r_.u16[i] = - HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset1 + i + 0] - b_.u8[b_offset1 + 0]))) + - HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset1 + i + 1] - b_.u8[b_offset1 + 1]))) + - HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset1 + i + 2] - b_.u8[b_offset1 + 2]))) + - HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset1 + i + 3] - b_.u8[b_offset1 + 3]))); - r_.u16[halfway_point + i] = - HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[2 * halfway_point + a_offset2 + i + 0] - b_.u8[2 * halfway_point + b_offset2 + 0]))) + - HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[2 * halfway_point + a_offset2 + i + 1] - b_.u8[2 * halfway_point + b_offset2 + 1]))) + - HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[2 * halfway_point + a_offset2 + i + 2] - b_.u8[2 * halfway_point + b_offset2 + 2]))) + - 
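The _mm256_movemask_epi8 fallback above simply packs the sign bit of each of the 32 bytes into one 32-bit integer, bit i coming from byte i. A scalar sketch of that packing (function name ours):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Bit i of the result is the top bit of bytes[i]. */
    static uint32_t movemask_u8_scalar(const uint8_t bytes[32])
    {
        uint32_t r = 0;
        for (size_t i = 0; i < 32; i++) {
            r |= (uint32_t) (bytes[i] >> 7) << i;
        }
        return r;
    }

    int main(void)
    {
        uint8_t v[32] = { 0 };
        v[0]  = 0x80; /* sets bit 0  */
        v[31] = 0xff; /* sets bit 31 */
        printf("0x%08" PRIx32 "\n", movemask_u8_scalar(v)); /* 0x80000001 */
        return 0;
    }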
HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[2 * halfway_point + a_offset2 + i + 3] - b_.u8[2 * halfway_point + b_offset2 + 3]))); - } - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) && SIMDE_DETECT_CLANG_VERSION_CHECK(3,9,0) - #define simde_mm256_mpsadbw_epu8(a, b, imm8) _mm256_mpsadbw_epu8(a, b, imm8) -#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - #define simde_mm256_mpsadbw_epu8(a, b, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_mpsadbw_epu8(simde_mm256_extracti128_si256(a, 1), simde_mm256_extracti128_si256(b, 1), (imm8 >> 3)), \ - simde_mm_mpsadbw_epu8(simde_mm256_extracti128_si256(a, 0), simde_mm256_extracti128_si256(b, 0), (imm8))) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_mpsadbw_epu8 - #define _mm256_mpsadbw_epu8(a, b, imm8) simde_mm256_mpsadbw_epu8(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mul_epi32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_mul_epi32(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_mul_epi32(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_mul_epi32(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = - HEDLEY_STATIC_CAST(int64_t, a_.i32[i * 2]) * - HEDLEY_STATIC_CAST(int64_t, b_.i32[i * 2]); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) -# define _mm256_mul_epi32(a, b) simde_mm256_mul_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mul_epu32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_mul_epu32(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_mul_epu32(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_mul_epu32(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = HEDLEY_STATIC_CAST(uint64_t, a_.u32[i * 2]) * HEDLEY_STATIC_CAST(uint64_t, b_.u32[i * 2]); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) -# define _mm256_mul_epu32(a, b) simde_mm256_mul_epu32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mulhi_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_mulhi_epi16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (HEDLEY_STATIC_CAST(uint32_t, HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i])) >> 16)); - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) -# define _mm256_mulhi_epi16(a, b) simde_mm256_mulhi_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mulhi_epu16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_mulhi_epu16(a, b); - #else - simde__m256i_private - r_, - a_ = 
simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.u16[i]) >> 16); - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) -# define _mm256_mulhi_epu16(a, b) simde_mm256_mulhi_epu16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mulhrs_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_mulhrs_epi16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, (((HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i])) + 0x4000) >> 15)); - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) -# define _mm256_mulhrs_epi16(a, b) simde_mm256_mulhrs_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mullo_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_mullo_epi16(a, b); - #else - simde__m256i_private - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b), - r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i16[i] * b_.i16[i]); - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_mullo_epi16 - #define _mm256_mullo_epi16(a, b) simde_mm256_mullo_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mullo_epi32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_mullo_epi32(a, b); - #else - simde__m256i_private - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b), - r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, a_.i32[i] * b_.i32[i]); - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_mullo_epi32 - #define _mm256_mullo_epi32(a, b) simde_mm256_mullo_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_mullo_epu32 (simde__m256i a, simde__m256i b) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u32 = a_.u32 * b_.u32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] * b_.u32[i]; - } - #endif - - return simde__m256i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_or_si256 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_or_si256(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_or_si128(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_or_si128(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f | b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) 
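The mulhi/mulhrs fallbacks above widen each 16-bit lane to 32 bits, multiply, and keep the upper half of the product (mulhrs additionally adds 0x4000 and shifts right by 15 to round). A scalar illustration of the mulhi_epi16 lane operation, computed the same way as the removed loop (helper name ours):

    #include <stdint.h>
    #include <stdio.h>

    /* High 16 bits of the signed 16x16 -> 32-bit product, taken via an
       unsigned shift of the widened product, as the fallback does. */
    static uint16_t mulhi_i16_lane(int16_t a, int16_t b)
    {
        uint32_t wide = (uint32_t) ((int32_t) a * (int32_t) b);
        return (uint16_t) (wide >> 16);
    }

    int main(void)
    {
        printf("0x%04x\n", (unsigned) mulhi_i16_lane(INT16_MAX, INT16_MAX)); /* 0x3fff */
        printf("0x%04x\n", (unsigned) mulhi_i16_lane(-2, INT16_MAX));        /* 0xffff (-1) */
        return 0;
    }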
; i++) { - r_.i32f[i] = a_.i32f[i] | b_.i32f[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_or_si256 - #define _mm256_or_si256(a, b) simde_mm256_or_si256(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_packs_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_packs_epi16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_packs_epi16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_packs_epi16(a_.m128i[1], b_.m128i[1]); - #else - const size_t halfway_point = (sizeof(r_.i8) / sizeof(r_.i8[0]))/2; - const size_t quarter_point = (sizeof(r_.i8) / sizeof(r_.i8[0]))/4; - SIMDE_VECTORIZE - for (size_t i = 0 ; i < quarter_point ; i++) { - r_.i8[i] = (a_.i16[i] > INT8_MAX) ? INT8_MAX : ((a_.i16[i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, a_.i16[i])); - r_.i8[i + quarter_point] = (b_.i16[i] > INT8_MAX) ? INT8_MAX : ((b_.i16[i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, b_.i16[i])); - r_.i8[halfway_point + i] = (a_.i16[quarter_point + i] > INT8_MAX) ? INT8_MAX : ((a_.i16[quarter_point + i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, a_.i16[quarter_point + i])); - r_.i8[halfway_point + i + quarter_point] = (b_.i16[quarter_point + i] > INT8_MAX) ? INT8_MAX : ((b_.i16[quarter_point + i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, b_.i16[quarter_point + i])); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_packs_epi16 - #define _mm256_packs_epi16(a, b) simde_mm256_packs_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_packs_epi32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_packs_epi32(a, b); - #else - simde__m256i_private - r_, - v_[] = { - simde__m256i_to_private(a), - simde__m256i_to_private(b) - }; - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_packs_epi32(v_[0].m128i[0], v_[1].m128i[0]); - r_.m128i[1] = simde_mm_packs_epi32(v_[0].m128i[1], v_[1].m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - const int32_t v = v_[(i >> 2) & 1].i32[(i & 11) - ((i & 8) >> 1)]; - r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, (v > INT16_MAX) ? INT16_MAX : ((v < INT16_MIN) ? INT16_MIN : v)); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_packs_epi32 - #define _mm256_packs_epi32(a, b) simde_mm256_packs_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_packus_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_packus_epi16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_packus_epi16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_packus_epi16(a_.m128i[1], b_.m128i[1]); - #else - const size_t halfway_point = (sizeof(r_.i8) / sizeof(r_.i8[0])) / 2; - const size_t quarter_point = (sizeof(r_.i8) / sizeof(r_.i8[0])) / 4; - SIMDE_VECTORIZE - for (size_t i = 0 ; i < quarter_point ; i++) { - r_.u8[i] = (a_.i16[i] > UINT8_MAX) ? UINT8_MAX : ((a_.i16[i] < 0) ? 
UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, a_.i16[i])); - r_.u8[i + quarter_point] = (b_.i16[i] > UINT8_MAX) ? UINT8_MAX : ((b_.i16[i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, b_.i16[i])); - r_.u8[halfway_point + i] = (a_.i16[quarter_point + i] > UINT8_MAX) ? UINT8_MAX : ((a_.i16[quarter_point + i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, a_.i16[quarter_point + i])); - r_.u8[halfway_point + i + quarter_point] = (b_.i16[quarter_point + i] > UINT8_MAX) ? UINT8_MAX : ((b_.i16[quarter_point + i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, b_.i16[quarter_point + i])); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_packus_epi16 - #define _mm256_packus_epi16(a, b) simde_mm256_packus_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_packus_epi32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_packus_epi32(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_packus_epi32(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_packus_epi32(a_.m128i[1], b_.m128i[1]); - #else - const size_t halfway_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 2; - const size_t quarter_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 4; - SIMDE_VECTORIZE - for (size_t i = 0 ; i < quarter_point ; i++) { - r_.u16[i] = (a_.i32[i] > UINT16_MAX) ? UINT16_MAX : ((a_.i32[i] < 0) ? UINT16_C(0) : HEDLEY_STATIC_CAST(uint16_t, a_.i32[i])); - r_.u16[i + quarter_point] = (b_.i32[i] > UINT16_MAX) ? UINT16_MAX : ((b_.i32[i] < 0) ? UINT16_C(0) : HEDLEY_STATIC_CAST(uint16_t, b_.i32[i])); - r_.u16[halfway_point + i] = (a_.i32[quarter_point + i] > UINT16_MAX) ? UINT16_MAX : ((a_.i32[quarter_point + i] < 0) ? UINT16_C(0) : HEDLEY_STATIC_CAST(uint16_t, a_.i32[quarter_point + i])); - r_.u16[halfway_point + i + quarter_point] = (b_.i32[quarter_point + i] > UINT16_MAX) ? UINT16_MAX : ((b_.i32[quarter_point + i] < 0) ? UINT16_C(0) : HEDLEY_STATIC_CAST(uint16_t, b_.i32[quarter_point + i])); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_packus_epi32 - #define _mm256_packus_epi32(a, b) simde_mm256_packus_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_permute2x128_si256 (simde__m256i a, simde__m256i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - r_.m128i_private[0] = (imm8 & 0x08) ? simde__m128i_to_private(simde_mm_setzero_si128()) : ((imm8 & 0x02) ? b_.m128i_private[(imm8 ) & 1] : a_.m128i_private[(imm8 ) & 1]); - r_.m128i_private[1] = (imm8 & 0x80) ? simde__m128i_to_private(simde_mm_setzero_si128()) : ((imm8 & 0x20) ? 
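The pack fallbacks above are lane-wise saturating narrowings; _mm256_packus_epi16, for instance, clamps each signed 16-bit value into [0, 255] before storing it as a byte, and the 256-bit versions interleave the two sources per 128-bit half. A one-lane scalar sketch of the unsigned saturation (helper name ours):

    #include <stdint.h>
    #include <stdio.h>

    /* Unsigned-saturating narrow of one int16 lane to a byte. */
    static uint8_t packus_u8(int16_t v)
    {
        if (v < 0)   return 0;
        if (v > 255) return 255;
        return (uint8_t) v;
    }

    int main(void)
    {
        printf("%d %d %d\n", packus_u8(-7), packus_u8(130), packus_u8(999)); /* 0 130 255 */
        return 0;
    }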
b_.m128i_private[(imm8 >> 4) & 1] : a_.m128i_private[(imm8 >> 4) & 1]); - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_permute2x128_si256(a, b, imm8) _mm256_permute2x128_si256(a, b, imm8) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_permute2x128_si256 - #define _mm256_permute2x128_si256(a, b, imm8) simde_mm256_permute2x128_si256(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_permute4x64_epi64 (simde__m256i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - r_.i64[0] = (imm8 & 0x02) ? a_.i64[((imm8 ) & 1)+2] : a_.i64[(imm8 ) & 1]; - r_.i64[1] = (imm8 & 0x08) ? a_.i64[((imm8 >> 2 ) & 1)+2] : a_.i64[(imm8 >> 2 ) & 1]; - r_.i64[2] = (imm8 & 0x20) ? a_.i64[((imm8 >> 4 ) & 1)+2] : a_.i64[(imm8 >> 4 ) & 1]; - r_.i64[3] = (imm8 & 0x80) ? a_.i64[((imm8 >> 6 ) & 1)+2] : a_.i64[(imm8 >> 6 ) & 1]; - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_permute4x64_epi64(a, imm8) _mm256_permute4x64_epi64(a, imm8) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_permute4x64_epi64 - #define _mm256_permute4x64_epi64(a, imm8) simde_mm256_permute4x64_epi64(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_permute4x64_pd (simde__m256d a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - r_.f64[0] = (imm8 & 0x02) ? a_.f64[((imm8 ) & 1)+2] : a_.f64[(imm8 ) & 1]; - r_.f64[1] = (imm8 & 0x08) ? a_.f64[((imm8 >> 2 ) & 1)+2] : a_.f64[(imm8 >> 2 ) & 1]; - r_.f64[2] = (imm8 & 0x20) ? a_.f64[((imm8 >> 4 ) & 1)+2] : a_.f64[(imm8 >> 4 ) & 1]; - r_.f64[3] = (imm8 & 0x80) ? 
a_.f64[((imm8 >> 6 ) & 1)+2] : a_.f64[(imm8 >> 6 ) & 1]; - - return simde__m256d_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_permute4x64_pd(a, imm8) _mm256_permute4x64_pd(a, imm8) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_permute4x64_pd - #define _mm256_permute4x64_pd(a, imm8) simde_mm256_permute4x64_pd(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_permutevar8x32_epi32 (simde__m256i a, simde__m256i idx) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_permutevar8x32_epi32(a, idx); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - idx_ = simde__m256i_to_private(idx); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[idx_.i32[i] & 7]; - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_permutevar8x32_epi32 - #define _mm256_permutevar8x32_epi32(a, idx) simde_mm256_permutevar8x32_epi32(a, idx) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_permutevar8x32_ps (simde__m256 a, simde__m256i idx) { - #if defined(SIMDE_X86_AVX2_NATIVE) - #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - return _mm256_permutevar8x32_ps(a, HEDLEY_REINTERPRET_CAST(simde__m256, idx)); - #else - return _mm256_permutevar8x32_ps(a, idx); - #endif - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - simde__m256i_private - idx_ = simde__m256i_to_private(idx); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[idx_.i32[i] & 7]; - } - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_permutevar8x32_ps - #define _mm256_permutevar8x32_ps(a, idx) simde_mm256_permutevar8x32_ps(a, idx) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_sad_epu8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_sad_epu8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_sad_epu8(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_sad_epu8(a_.m128i[1], b_.m128i[1]); - #else - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - uint16_t tmp = 0; - SIMDE_VECTORIZE_REDUCTION(+:tmp) - for (size_t j = 0 ; j < ((sizeof(r_.u8) / sizeof(r_.u8[0])) / 4) ; j++) { - const size_t e = j + (i * 8); - tmp += (a_.u8[e] > b_.u8[e]) ? (a_.u8[e] - b_.u8[e]) : (b_.u8[e] - a_.u8[e]); - } - r_.i64[i] = tmp; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_sad_epu8 - #define _mm256_sad_epu8(a, b) simde_mm256_sad_epu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_shuffle_epi8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_shuffle_epi8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_shuffle_epi8(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_shuffle_epi8(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < ((sizeof(r_.u8) / sizeof(r_.u8[0])) / 2) ; i++) { - r_.u8[ i ] = (b_.u8[ i ] & 0x80) ? 
0 : a_.u8[(b_.u8[ i ] & 0x0f) ]; - r_.u8[i + 16] = (b_.u8[i + 16] & 0x80) ? 0 : a_.u8[(b_.u8[i + 16] & 0x0f) + 16]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_shuffle_epi8 - #define _mm256_shuffle_epi8(a, b) simde_mm256_shuffle_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_shuffle_epi32 (simde__m256i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - for (size_t i = 0 ; i < ((sizeof(r_.i32) / sizeof(r_.i32[0])) / 2) ; i++) { - r_.i32[i] = a_.i32[(imm8 >> (i * 2)) & 3]; - } - for (size_t i = 0 ; i < ((sizeof(r_.i32) / sizeof(r_.i32[0])) / 2) ; i++) { - r_.i32[i + 4] = a_.i32[((imm8 >> (i * 2)) & 3) + 4]; - } - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_shuffle_epi32(a, imm8) _mm256_shuffle_epi32(a, imm8) -#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) && !defined(__PGI) -# define simde_mm256_shuffle_epi32(a, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_shuffle_epi32(simde_mm256_extracti128_si256(a, 1), (imm8)), \ - simde_mm_shuffle_epi32(simde_mm256_extracti128_si256(a, 0), (imm8))) -#elif defined(SIMDE_SHUFFLE_VECTOR_) -# define simde_mm256_shuffle_epi32(a, imm8) (__extension__ ({ \ - const simde__m256i_private simde_tmp_a_ = simde__m256i_to_private(a); \ - simde__m256i_from_private((simde__m256i_private) { .i32 = \ - SIMDE_SHUFFLE_VECTOR_(32, 32, \ - (simde_tmp_a_).i32, \ - (simde_tmp_a_).i32, \ - ((imm8) ) & 3, \ - ((imm8) >> 2) & 3, \ - ((imm8) >> 4) & 3, \ - ((imm8) >> 6) & 3, \ - (((imm8) ) & 3) + 4, \ - (((imm8) >> 2) & 3) + 4, \ - (((imm8) >> 4) & 3) + 4, \ - (((imm8) >> 6) & 3) + 4) }); })) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_shuffle_epi32 - #define _mm256_shuffle_epi32(a, imm8) simde_mm256_shuffle_epi32(a, imm8) -#endif - -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_shufflehi_epi16(a, imm8) _mm256_shufflehi_epi16(a, imm8) -#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) -# define simde_mm256_shufflehi_epi16(a, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_shufflehi_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \ - simde_mm_shufflehi_epi16(simde_mm256_extracti128_si256(a, 0), (imm8))) -#elif defined(SIMDE_SHUFFLE_VECTOR_) -# define simde_mm256_shufflehi_epi16(a, imm8) (__extension__ ({ \ - const simde__m256i_private simde_tmp_a_ = simde__m256i_to_private(a); \ - simde__m256i_from_private((simde__m256i_private) { .i16 = \ - SIMDE_SHUFFLE_VECTOR_(16, 32, \ - (simde_tmp_a_).i16, \ - (simde_tmp_a_).i16, \ - 0, 1, 2, 3, \ - (((imm8) ) & 3) + 4, \ - (((imm8) >> 2) & 3) + 4, \ - (((imm8) >> 4) & 3) + 4, \ - (((imm8) >> 6) & 3) + 4, \ - 8, 9, 10, 11, \ - ((((imm8) ) & 3) + 8 + 4), \ - ((((imm8) >> 2) & 3) + 8 + 4), \ - ((((imm8) >> 4) & 3) + 8 + 4), \ - ((((imm8) >> 6) & 3) + 8 + 4) \ - ) }); })) -#else -# define simde_mm256_shufflehi_epi16(a, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_shufflehi_epi16(simde_mm256_extracti128_si256(a, 1), imm8), \ - simde_mm_shufflehi_epi16(simde_mm256_extracti128_si256(a, 0), imm8)) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_shufflehi_epi16 - #define _mm256_shufflehi_epi16(a, imm8) simde_mm256_shufflehi_epi16(a, imm8) -#endif - -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_shufflelo_epi16(a, imm8) _mm256_shufflelo_epi16(a, imm8) -#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) -# define 
simde_mm256_shufflelo_epi16(a, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \ - simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 0), (imm8))) -#elif defined(SIMDE_SHUFFLE_VECTOR_) -# define simde_mm256_shufflelo_epi16(a, imm8) (__extension__ ({ \ - const simde__m256i_private simde_tmp_a_ = simde__m256i_to_private(a); \ - simde__m256i_from_private((simde__m256i_private) { .i16 = \ - SIMDE_SHUFFLE_VECTOR_(16, 32, \ - (simde_tmp_a_).i16, \ - (simde_tmp_a_).i16, \ - (((imm8) ) & 3), \ - (((imm8) >> 2) & 3), \ - (((imm8) >> 4) & 3), \ - (((imm8) >> 6) & 3), \ - 4, 5, 6, 7, \ - ((((imm8) ) & 3) + 8), \ - ((((imm8) >> 2) & 3) + 8), \ - ((((imm8) >> 4) & 3) + 8), \ - ((((imm8) >> 6) & 3) + 8), \ - 12, 13, 14, 15) }); })) -#else -# define simde_mm256_shufflelo_epi16(a, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 1), imm8), \ - simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 0), imm8)) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_shufflelo_epi16 - #define _mm256_shufflelo_epi16(a, imm8) simde_mm256_shufflelo_epi16(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_sign_epi8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_sign_epi8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (b_.i8[i] < INT32_C(0)) ? -a_.i8[i] : a_.i8[i]; - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_sign_epi8 - #define _mm256_sign_epi8(a, b) simde_mm256_sign_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_sign_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_sign_epi16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (b_.i16[i] < INT32_C(0)) ? -a_.i16[i] : a_.i16[i]; - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_sign_epi16 - #define _mm256_sign_epi16(a, b) simde_mm256_sign_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_sign_epi32(simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_sign_epi32(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) { - r_.i32[i] = (b_.i32[i] < INT32_C(0)) ? 
-a_.i32[i] : a_.i32[i]; - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_sign_epi32 - #define _mm256_sign_epi32(a, b) simde_mm256_sign_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_sll_epi16 (simde__m256i a, simde__m128i count) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_sll_epi16(a, count); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_sll_epi16(a_.m128i[0], count); - r_.m128i[1] = simde_mm_sll_epi16(a_.m128i[1], count); - #else - simde__m128i_private - count_ = simde__m128i_to_private(count); - - uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]); - if (shift > 15) - return simde_mm256_setzero_si256(); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i16 = a_.i16 << HEDLEY_STATIC_CAST(int16_t, shift); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i16[i] << (shift)); - } - #endif - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_sll_epi16 - #define _mm256_sll_epi16(a, count) simde_mm256_sll_epi16(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_sll_epi32 (simde__m256i a, simde__m128i count) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_sll_epi32(a, count); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_sll_epi32(a_.m128i[0], count); - r_.m128i[1] = simde_mm_sll_epi32(a_.m128i[1], count); - #else - simde__m128i_private - count_ = simde__m128i_to_private(count); - - uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]); - if (shift > 31) - return simde_mm256_setzero_si256(); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i32 = a_.i32 << HEDLEY_STATIC_CAST(int32_t, shift); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, a_.i32[i] << (shift)); - } - #endif - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_sll_epi32 - #define _mm256_sll_epi32(a, count) simde_mm256_sll_epi32(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_sll_epi64 (simde__m256i a, simde__m128i count) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_sll_epi64(a, count); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_sll_epi64(a_.m128i[0], count); - r_.m128i[1] = simde_mm_sll_epi64(a_.m128i[1], count); - #else - simde__m128i_private - count_ = simde__m128i_to_private(count); - - uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]); - if (shift > 63) - return simde_mm256_setzero_si256(); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i64 = a_.i64 << HEDLEY_STATIC_CAST(int64_t, shift); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = HEDLEY_STATIC_CAST(int64_t, a_.i64[i] << (shift)); - } - #endif - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_sll_epi64 - #define _mm256_sll_epi64(a, count) simde_mm256_sll_epi64(a, count) -#endif - 
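The sll fallbacks above all follow one pattern: take the shift count from the low 64 bits of the 128-bit count argument, return an all-zero vector when the count exceeds the element width, and otherwise shift every lane by that same amount. A minimal standalone C sketch of that rule for 32-bit lanes follows; the emulate_sll_epi32 name and the plain-array types are illustrative only and are not part of simde or Box2D.

#include <stdint.h>
#include <string.h>

/* Shift eight 32-bit lanes left by the count held in the low 64 bits of
 * `count`, zeroing the whole result when the count is 32 or larger --
 * the same behaviour as the scalar fallback for _mm256_sll_epi32 above. */
static void emulate_sll_epi32(uint32_t r[8], const uint32_t a[8], const uint64_t count[2])
{
    uint64_t shift = count[0];              /* only the low 64 bits are used */

    if (shift > 31) {                       /* oversized count: result is zero */
        memset(r, 0, 8 * sizeof(uint32_t));
        return;
    }

    for (size_t i = 0; i < 8; i++) {
        r[i] = a[i] << shift;               /* every lane shares one count */
    }
}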
-SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_slli_epi16 (simde__m256i a, const int imm8) - SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - /* Note: There is no consistency in how compilers handle values outside of - the expected range, hence the discrepancy between what we allow and what - Intel specifies. Some compilers will return 0, others seem to just mask - off everything outside of the range. */ - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) sv = vec_splats(HEDLEY_STATIC_CAST(unsigned short, imm8)); - for (size_t i = 0 ; i < (sizeof(a_.altivec_i16) / sizeof(a_.altivec_i16[0])) ; i++) { - r_.altivec_i16[i] = vec_sl(a_.altivec_i16[i], sv); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i16 = a_.i16 << HEDLEY_STATIC_CAST(int16_t, imm8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i16[i] << (imm8 & 0xff)); - } - #endif - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_slli_epi16(a, imm8) _mm256_slli_epi16(a, imm8) -#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) -# define simde_mm256_slli_epi16(a, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_slli_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \ - simde_mm_slli_epi16(simde_mm256_extracti128_si256(a, 0), (imm8))) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_slli_epi16 - #define _mm256_slli_epi16(a, imm8) simde_mm256_slli_epi16(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_slli_epi32 (simde__m256i a, const int imm8) - SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) sv = vec_splats(HEDLEY_STATIC_CAST(unsigned int, imm8)); - for (size_t i = 0 ; i < (sizeof(a_.altivec_i32) / sizeof(a_.altivec_i32[0])) ; i++) { - r_.altivec_i32[i] = vec_sl(a_.altivec_i32[i], sv); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i32 = a_.i32 << HEDLEY_STATIC_CAST(int32_t, imm8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] << (imm8 & 0xff); - } - #endif - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_slli_epi32(a, imm8) _mm256_slli_epi32(a, imm8) -#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) -# define simde_mm256_slli_epi32(a, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_slli_epi32(simde_mm256_extracti128_si256(a, 1), (imm8)), \ - simde_mm_slli_epi32(simde_mm256_extracti128_si256(a, 0), (imm8))) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_slli_epi32 - #define _mm256_slli_epi32(a, imm8) simde_mm256_slli_epi32(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_slli_epi64 (simde__m256i a, const int imm8) - SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - -#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i64 = a_.i64 << HEDLEY_STATIC_CAST(int64_t, imm8); -#else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[i] << (imm8 & 0xff); - } -#endif - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_slli_epi64(a, imm8) _mm256_slli_epi64(a, imm8) -#elif 
SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) -# define simde_mm256_slli_epi64(a, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_slli_epi64(simde_mm256_extracti128_si256(a, 1), (imm8)), \ - simde_mm_slli_epi64(simde_mm256_extracti128_si256(a, 0), (imm8))) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_slli_epi64 - #define _mm256_slli_epi64(a, imm8) simde_mm256_slli_epi64(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_slli_si256 (simde__m256i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - for (size_t h = 0 ; h < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; h++) { - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m128i_private[h].i8) / sizeof(r_.m128i_private[h].i8[0])) ; i++) { - const int e = HEDLEY_STATIC_CAST(int, i) - imm8; - r_.m128i_private[h].i8[i] = (e >= 0) ? a_.m128i_private[h].i8[e] : 0; - } - } - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_slli_si256(a, imm8) _mm256_slli_si256(a, imm8) -#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) && !defined(__PGI) -# define simde_mm256_slli_si256(a, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_slli_si128(simde_mm256_extracti128_si256(a, 1), (imm8)), \ - simde_mm_slli_si128(simde_mm256_extracti128_si256(a, 0), (imm8))) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) -# define simde_mm256_slli_si256(a, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_bslli_si128(simde_mm256_extracti128_si256(a, 1), (imm8)), \ - simde_mm_bslli_si128(simde_mm256_extracti128_si256(a, 0), (imm8))) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_slli_si256 - #define _mm256_slli_si256(a, imm8) simde_mm256_slli_si256(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_sllv_epi32 (simde__m128i a, simde__m128i b) { - simde__m128i_private - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vshlq_u32(a_.neon_u32, vreinterpretq_s32_u32(b_.neon_u32)); - r_.neon_u32 = vandq_u32(r_.neon_u32, vcltq_u32(b_.neon_u32, vdupq_n_u32(32))); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (b_.u32 < UINT32_C(32))) & (a_.u32 << b_.u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (b_.u32[i] < 32) ? (a_.u32[i] << b_.u32[i]) : 0; - } - #endif - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm_sllv_epi32(a, b) _mm_sllv_epi32(a, b) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_sllv_epi32 - #define _mm_sllv_epi32(a, b) simde_mm_sllv_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_sllv_epi32 (simde__m256i a, simde__m256i b) { - simde__m256i_private - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b), - r_; - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_sllv_epi32(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_sllv_epi32(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (b_.u32 < 32)) & (a_.u32 << b_.u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (b_.u32[i] < 32) ? 
(a_.u32[i] << b_.u32[i]) : 0; - } - #endif - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm256_sllv_epi32(a, b) _mm256_sllv_epi32(a, b) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_sllv_epi32 - #define _mm256_sllv_epi32(a, b) simde_mm256_sllv_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_sllv_epi64 (simde__m128i a, simde__m128i b) { - simde__m128i_private - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - r_; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u64 = vshlq_u64(a_.neon_u64, vreinterpretq_s64_u64(b_.neon_u64)); - r_.neon_u64 = vandq_u64(r_.neon_u64, vcltq_u64(b_.neon_u64, vdupq_n_u64(64))); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), (b_.u64 < 64)) & (a_.u64 << b_.u64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = (b_.u64[i] < 64) ? (a_.u64[i] << b_.u64[i]) : 0; - } - #endif - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm_sllv_epi64(a, b) _mm_sllv_epi64(a, b) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_sllv_epi64 - #define _mm_sllv_epi64(a, b) simde_mm_sllv_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_sllv_epi64 (simde__m256i a, simde__m256i b) { - simde__m256i_private - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b), - r_; - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_sllv_epi64(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_sllv_epi64(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), (b_.u64 < 64)) & (a_.u64 << b_.u64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = (b_.u64[i] < 64) ? 
(a_.u64[i] << b_.u64[i]) : 0; - } - #endif - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm256_sllv_epi64(a, b) _mm256_sllv_epi64(a, b) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_sllv_epi64 - #define _mm256_sllv_epi64(a, b) simde_mm256_sllv_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_sra_epi16 (simde__m256i a, simde__m128i count) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_sra_epi16(a, count); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_sra_epi16(a_.m128i[0], count); - r_.m128i[1] = simde_mm_sra_epi16(a_.m128i[1], count); - #else - simde__m128i_private - count_ = simde__m128i_to_private(count); - - uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]); - - if (shift > 15) shift = 15; - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i16 = a_.i16 >> HEDLEY_STATIC_CAST(int16_t, shift); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[i] >> shift; - } - #endif - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_sra_epi16 - #define _mm256_sra_epi16(a, count) simde_mm256_sra_epi16(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_sra_epi32 (simde__m256i a, simde__m128i count) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_sra_epi32(a, count); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_sra_epi32(a_.m128i[0], count); - r_.m128i[1] = simde_mm_sra_epi32(a_.m128i[1], count); - #else - simde__m128i_private - count_ = simde__m128i_to_private(count); - uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]); - - if (shift > 31) shift = 31; - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i32 = a_.i32 >> HEDLEY_STATIC_CAST(int16_t, shift); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] >> shift; - } - #endif - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_sra_epi32 - #define _mm256_sra_epi32(a, count) simde_mm256_sra_epi32(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_srai_epi16 (simde__m256i a, const int imm8) - SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - unsigned int shift = HEDLEY_STATIC_CAST(unsigned int, imm8); - - if (shift > 15) shift = 15; - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i16 = a_.i16 >> HEDLEY_STATIC_CAST(int16_t, shift); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[i] >> shift; - } - #endif - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_srai_epi16(a, imm8) _mm256_srai_epi16(a, imm8) -#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) -# define simde_mm256_srai_epi16(a, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_srai_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \ - simde_mm_srai_epi16(simde_mm256_extracti128_si256(a, 0), (imm8))) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_srai_epi16 - #define _mm256_srai_epi16(a, imm8) simde_mm256_srai_epi16(a, imm8) -#endif - 
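The arithmetic-shift fallbacks above (sra/srai) handle an oversized count differently from the logical shifts further below: the count is clamped to the element width minus one, so each lane collapses to its sign bit, whereas srl/srli produce zero. A small C sketch of the two rules for a single 16-bit lane, with illustrative function names and assuming the usual arithmetic behaviour of >> on negative values:

#include <stdint.h>

/* Arithmetic right shift: counts above 15 are clamped to 15, so a negative
 * lane ends up as -1 and a non-negative lane as 0. */
static int16_t sra16_lane(int16_t a, unsigned int count)
{
    if (count > 15) {
        count = 15;
    }
    return (int16_t)(a >> count);
}

/* Logical right shift: counts above 15 yield zero, matching the srl/srli
 * fallbacks later in this header. */
static uint16_t srl16_lane(uint16_t a, unsigned int count)
{
    return (count > 15) ? (uint16_t)0 : (uint16_t)(a >> count);
}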
-SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_srai_epi32 (simde__m256i a, const int imm8) - SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - unsigned int shift = HEDLEY_STATIC_CAST(unsigned int, imm8); - - if (shift > 31) shift = 31; - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i32 = a_.i32 >> HEDLEY_STATIC_CAST(int16_t, shift); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] >> shift; - } - #endif - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_srai_epi32(a, imm8) _mm256_srai_epi32(a, imm8) -#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) -# define simde_mm256_srai_epi32(a, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_srai_epi32(simde_mm256_extracti128_si256(a, 1), (imm8)), \ - simde_mm_srai_epi32(simde_mm256_extracti128_si256(a, 0), (imm8))) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_srai_epi32 - #define _mm256_srai_epi32(a, imm8) simde_mm256_srai_epi32(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_srav_epi32 (simde__m128i a, simde__m128i count) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm_srav_epi32(a, count); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - count_ = simde__m128i_to_private(count); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int32x4_t cnt = vreinterpretq_s32_u32(vminq_u32(count_.neon_u32, vdupq_n_u32(31))); - r_.neon_i32 = vshlq_s32(a_.neon_i32, vnegq_s32(cnt)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - uint32_t shift = HEDLEY_STATIC_CAST(uint32_t, count_.i32[i]); - r_.i32[i] = a_.i32[i] >> HEDLEY_STATIC_CAST(int, shift > 31 ? 31 : shift); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_srav_epi32 - #define _mm_srav_epi32(a, count) simde_mm_srav_epi32(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_srav_epi32 (simde__m256i a, simde__m256i count) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_srav_epi32(a, count); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - count_ = simde__m256i_to_private(count); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_srav_epi32(a_.m128i[0], count_.m128i[0]); - r_.m128i[1] = simde_mm_srav_epi32(a_.m128i[1], count_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - uint32_t shift = HEDLEY_STATIC_CAST(uint32_t, count_.i32[i]); - if (shift > 31) shift = 31; - r_.i32[i] = a_.i32[i] >> shift; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_srav_epi32 - #define _mm256_srav_epi32(a, count) simde_mm256_srav_epi32(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_srl_epi16 (simde__m256i a, simde__m128i count) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_srl_epi16(a, count); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_srl_epi16(a_.m128i[0], count); - r_.m128i[1] = simde_mm_srl_epi16(a_.m128i[1], count); - #else - simde__m128i_private - count_ = simde__m128i_to_private(count); - - uint64_t shift = HEDLEY_STATIC_CAST(uint64_t , (count_.i64[0] > 16 ? 
16 : count_.i64[0])); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u16 = a_.u16 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(16, shift); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.u16[i] = a_.u16[i] >> (shift); - } - #endif - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_srl_epi16 - #define _mm256_srl_epi16(a, count) simde_mm256_srl_epi16(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_srl_epi32 (simde__m256i a, simde__m128i count) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_srl_epi32(a, count); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_srl_epi32(a_.m128i[0], count); - r_.m128i[1] = simde_mm_srl_epi32(a_.m128i[1], count); - #else - simde__m128i_private - count_ = simde__m128i_to_private(count); - - uint64_t shift = HEDLEY_STATIC_CAST(uint64_t , (count_.i64[0] > 32 ? 32 : count_.i64[0])); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u32 = a_.u32 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(32, shift); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.u32[i] = a_.u32[i] >> (shift); - } - #endif - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_srl_epi32 - #define _mm256_srl_epi32(a, count) simde_mm256_srl_epi32(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_srl_epi64 (simde__m256i a, simde__m128i count) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_srl_epi64(a, count); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_srl_epi64(a_.m128i[0], count); - r_.m128i[1] = simde_mm_srl_epi64(a_.m128i[1], count); - #else - simde__m128i_private - count_ = simde__m128i_to_private(count); - - uint64_t shift = HEDLEY_STATIC_CAST(uint64_t , (count_.i64[0] > 64 ? 
64 : count_.i64[0])); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u64 = a_.u64 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(64, shift); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.u64[i] = a_.u64[i] >> (shift); - } - #endif - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_srl_epi64 - #define _mm256_srl_epi64(a, count) simde_mm256_srl_epi64(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_srli_epi16 (simde__m256i a, const int imm8) - SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - if (imm8 > 15) - return simde_mm256_setzero_si256(); - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) sv = vec_splats(HEDLEY_STATIC_CAST(unsigned short, imm8)); - for (size_t i = 0 ; i < (sizeof(a_.altivec_u16) / sizeof(a_.altivec_u16[0])) ; i++) { - r_.altivec_u16[i] = vec_sr(a_.altivec_u16[i], sv); - } - #else - if (HEDLEY_STATIC_CAST(unsigned int, imm8) > 15) { - simde_memset(&r_, 0, sizeof(r_)); - } else { - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u16 = a_.u16 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(16, imm8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = a_.u16[i] >> imm8; - } - #endif - } - #endif - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_srli_epi16(a, imm8) _mm256_srli_epi16(a, imm8) -#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) -# define simde_mm256_srli_epi16(a, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_srli_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \ - simde_mm_srli_epi16(simde_mm256_extracti128_si256(a, 0), (imm8))) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_srli_epi16 - #define _mm256_srli_epi16(a, imm8) simde_mm256_srli_epi16(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_srli_epi32 (simde__m256i a, const int imm8) - SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) sv = vec_splats(HEDLEY_STATIC_CAST(unsigned int, imm8)); - for (size_t i = 0 ; i < (sizeof(a_.altivec_u32) / sizeof(a_.altivec_u32[0])) ; i++) { - r_.altivec_u32[i] = vec_sr(a_.altivec_u32[i], sv); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u32 = a_.u32 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(16, imm8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] >> imm8; - } - #endif - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_srli_epi32(a, imm8) _mm256_srli_epi32(a, imm8) -#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) -# define simde_mm256_srli_epi32(a, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_srli_epi32(simde_mm256_extracti128_si256(a, 1), (imm8)), \ - simde_mm_srli_epi32(simde_mm256_extracti128_si256(a, 0), (imm8))) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_srli_epi32 - #define _mm256_srli_epi32(a, imm8) simde_mm256_srli_epi32(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_srli_epi64 (simde__m256i a, const int imm8) - SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - -#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - 
r_.u64 = a_.u64 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(32, imm8); -#else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = a_.u64[i] >> imm8; - } -#endif - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_srli_epi64(a, imm8) _mm256_srli_epi64(a, imm8) -#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) -# define simde_mm256_srli_epi64(a, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_srli_epi64(simde_mm256_extracti128_si256(a, 1), (imm8)), \ - simde_mm_srli_epi64(simde_mm256_extracti128_si256(a, 0), (imm8))) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_srli_epi64 - #define _mm256_srli_epi64(a, imm8) simde_mm256_srli_epi64(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_srli_si256 (simde__m256i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - for (size_t h = 0 ; h < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; h++) { - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m128i_private[h].i8) / sizeof(r_.m128i_private[h].i8[0])) ; i++) { - const int e = imm8 + HEDLEY_STATIC_CAST(int, i); - r_.m128i_private[h].i8[i] = (e < 16) ? a_.m128i_private[h].i8[e] : 0; - } - } - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_srli_si256(a, imm8) _mm256_srli_si256(a, imm8) -#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) && !defined(__PGI) -# define simde_mm256_srli_si256(a, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_srli_si128(simde_mm256_extracti128_si256(a, 1), (imm8)), \ - simde_mm_srli_si128(simde_mm256_extracti128_si256(a, 0), (imm8))) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) -# define simde_mm256_srli_si256(a, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_bsrli_si128(simde_mm256_extracti128_si256(a, 1), (imm8)), \ - simde_mm_bsrli_si128(simde_mm256_extracti128_si256(a, 0), (imm8))) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_srli_si256 - #define _mm256_srli_si256(a, imm8) simde_mm256_srli_si256(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_srlv_epi32 (simde__m128i a, simde__m128i b) { - simde__m128i_private - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - r_; - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (b_.u32 < 32)) & (a_.u32 >> b_.u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (b_.u32[i] < 32) ? (a_.u32[i] >> b_.u32[i]) : 0; - } - #endif - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm_srlv_epi32(a, b) _mm_srlv_epi32(a, b) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_srlv_epi32 - #define _mm_srlv_epi32(a, b) simde_mm_srlv_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_srlv_epi32 (simde__m256i a, simde__m256i b) { - simde__m256i_private - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b), - r_; - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (b_.u32 < 32)) & (a_.u32 >> b_.u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (b_.u32[i] < 32) ? 
(a_.u32[i] >> b_.u32[i]) : 0; - } - #endif - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm256_srlv_epi32(a, b) _mm256_srlv_epi32(a, b) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_srlv_epi32 - #define _mm256_srlv_epi32(a, b) simde_mm256_srlv_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_srlv_epi64 (simde__m128i a, simde__m128i b) { - simde__m128i_private - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - r_; - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), (b_.u64 < 64)) & (a_.u64 >> b_.u64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = (b_.u64[i] < 64) ? (a_.u64[i] >> b_.u64[i]) : 0; - } - #endif - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm_srlv_epi64(a, b) _mm_srlv_epi64(a, b) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_srlv_epi64 - #define _mm_srlv_epi64(a, b) simde_mm_srlv_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_srlv_epi64 (simde__m256i a, simde__m256i b) { - simde__m256i_private - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b), - r_; - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), (b_.u64 < 64)) & (a_.u64 >> b_.u64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = (b_.u64[i] < 64) ? (a_.u64[i] >> b_.u64[i]) : 0; - } - #endif - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm256_srlv_epi64(a, b) _mm256_srlv_epi64(a, b) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_srlv_epi64 - #define _mm256_srlv_epi64(a, b) simde_mm256_srlv_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_stream_load_si256 (const simde__m256i* mem_addr) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_stream_load_si256(HEDLEY_CONST_CAST(simde__m256i*, mem_addr)); - #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT) - return __builtin_nontemporal_load(mem_addr); - #else - simde__m256i r; - simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), sizeof(r)); - return r; - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) -# define _mm256_stream_load_si256(mem_addr) simde_mm256_stream_load_si256(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_sub_epi8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_sub_epi8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_sub_epi8(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_sub_epi8(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = a_.i8 - b_.i8; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[i] - b_.i8[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_sub_epi8 - #define _mm256_sub_epi8(a, b) simde_mm256_sub_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_sub_epi16 (simde__m256i a, simde__m256i b) { - #if 
defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_sub_epi16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_sub_epi16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_sub_epi16(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = a_.i16 - b_.i16; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[i] - b_.i16[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_sub_epi16 - #define _mm256_sub_epi16(a, b) simde_mm256_sub_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_hsub_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_hsub_epi16(a, b); - #else - return simde_mm256_sub_epi16(simde_x_mm256_deinterleaveeven_epi16(a, b), simde_x_mm256_deinterleaveodd_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_hsub_epi16 - #define _mm256_hsub_epi16(a, b) simde_mm256_hsub_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_sub_epi32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_sub_epi32(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_sub_epi32(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_sub_epi32(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = a_.i32 - b_.i32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] - b_.i32[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_sub_epi32 - #define _mm256_sub_epi32(a, b) simde_mm256_sub_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_hsub_epi32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_hsub_epi32(a, b); - #else - return simde_mm256_sub_epi32(simde_x_mm256_deinterleaveeven_epi32(a, b), simde_x_mm256_deinterleaveodd_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_hsub_epi32 - #define _mm256_hsub_epi32(a, b) simde_mm256_hsub_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_sub_epi64 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_sub_epi64(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_sub_epi64(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_sub_epi64(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = a_.i64 - b_.i64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[i] - b_.i64[i]; - } - #endif - - return simde__m256i_from_private(r_); -#endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_sub_epi64 - #define _mm256_sub_epi64(a, b) simde_mm256_sub_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_sub_epu32 (simde__m256i a, simde__m256i b) { - simde__m256i_private - r_, - a_ 
= simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u32 = a_.u32 - b_.u32; - #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_x_mm_sub_epu32(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_x_mm_sub_epu32(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] - b_.u32[i]; - } - #endif - - return simde__m256i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_subs_epi8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_subs_epi8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_subs_epi8(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_subs_epi8(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = simde_math_subs_i8(a_.i8[i], b_.i8[i]); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_subs_epi8 - #define _mm256_subs_epi8(a, b) simde_mm256_subs_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_subs_epi16(simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_subs_epi16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_subs_epi16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_subs_epi16(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = simde_math_subs_i16(a_.i16[i], b_.i16[i]); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_subs_epi16 - #define _mm256_subs_epi16(a, b) simde_mm256_subs_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_hsubs_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_hsubs_epi16(a, b); - #else - return simde_mm256_subs_epi16(simde_x_mm256_deinterleaveeven_epi16(a, b), simde_x_mm256_deinterleaveodd_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_hsubs_epi16 - #define _mm256_hsubs_epi16(a, b) simde_mm256_hsubs_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_subs_epu8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_subs_epu8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_subs_epu8(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_subs_epu8(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = simde_math_subs_u8(a_.u8[i], b_.u8[i]); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_subs_epu8 - #define _mm256_subs_epu8(a, b) simde_mm256_subs_epu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_subs_epu16(simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - 
return _mm256_subs_epu16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_subs_epu16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_subs_epu16(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = simde_math_subs_u16(a_.u16[i], b_.u16[i]); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_subs_epu16 - #define _mm256_subs_epu16(a, b) simde_mm256_subs_epu16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_x_mm256_test_all_ones (simde__m256i a) { - simde__m256i_private a_ = simde__m256i_to_private(a); - int r; - int_fast32_t r_ = ~HEDLEY_STATIC_CAST(int_fast32_t, 0); - - SIMDE_VECTORIZE_REDUCTION(&:r_) - for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) { - r_ &= a_.i32f[i]; - } - - r = (r_ == ~HEDLEY_STATIC_CAST(int_fast32_t, 0)); - - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_unpacklo_epi8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_unpacklo_epi8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_unpacklo_epi8(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_unpacklo_epi8(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 32, a_.i8, b_.i8, - 0, 32, 1, 33, 2, 34, 3, 35, - 4, 36, 5, 37, 6, 38, 7, 39, - 16, 48, 17, 49, 18, 50, 19, 51, - 20, 52, 21, 53, 22, 54, 23, 55); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0]) / 2) ; i++) { - r_.i8[2 * i] = a_.i8[i + ~(~i | 7)]; - r_.i8[2 * i + 1] = b_.i8[i + ~(~i | 7)]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_unpacklo_epi8 - #define _mm256_unpacklo_epi8(a, b) simde_mm256_unpacklo_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_unpacklo_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_unpacklo_epi16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_unpacklo_epi16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_unpacklo_epi16(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i16 =SIMDE_SHUFFLE_VECTOR_(16, 32, a_.i16, b_.i16, - 0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0]) / 2) ; i++) { - r_.i16[2 * i] = a_.i16[i + ~(~i | 3)]; - r_.i16[2 * i + 1] = b_.i16[i + ~(~i | 3)]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_unpacklo_epi16 - #define _mm256_unpacklo_epi16(a, b) simde_mm256_unpacklo_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_unpacklo_epi32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_unpacklo_epi32(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if 
SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_unpacklo_epi32(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_unpacklo_epi32(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.i32, b_.i32, - 0, 8, 1, 9, 4, 12, 5, 13); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0]) / 2) ; i++) { - r_.i32[2 * i] = a_.i32[i + ~(~i | 1)]; - r_.i32[2 * i + 1] = b_.i32[i + ~(~i | 1)]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_unpacklo_epi32 - #define _mm256_unpacklo_epi32(a, b) simde_mm256_unpacklo_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_unpacklo_epi64 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_unpacklo_epi64(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_unpacklo_epi64(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_unpacklo_epi64(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 32, a_.i64, b_.i64, 0, 4, 2, 6); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0]) / 2) ; i++) { - r_.i64[2 * i] = a_.i64[2 * i]; - r_.i64[2 * i + 1] = b_.i64[2 * i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_unpacklo_epi64 - #define _mm256_unpacklo_epi64(a, b) simde_mm256_unpacklo_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_unpackhi_epi8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_unpackhi_epi8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_unpackhi_epi8(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_unpackhi_epi8(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 32, a_.i8, b_.i8, - 8, 40, 9, 41, 10, 42, 11, 43, - 12, 44, 13, 45, 14, 46, 15, 47, - 24, 56, 25, 57, 26, 58, 27, 59, - 28, 60, 29, 61, 30, 62, 31, 63); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0]) / 2) ; i++) { - r_.i8[2 * i] = a_.i8[i + 8 + ~(~i | 7)]; - r_.i8[2 * i + 1] = b_.i8[i + 8 + ~(~i | 7)]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_unpackhi_epi8 - #define _mm256_unpackhi_epi8(a, b) simde_mm256_unpackhi_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_unpackhi_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_unpackhi_epi16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_unpackhi_epi16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_unpackhi_epi16(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 32, a_.i16, b_.i16, - 4, 20, 5, 21, 6, 22, 7, 23, - 12, 28, 13, 29, 14, 30, 15, 31); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0]) / 2) ; i++) { - r_.i16[2 * i] = a_.i16[i 
+ 4 + ~(~i | 3)]; - r_.i16[2 * i + 1] = b_.i16[i + 4 + ~(~i | 3)]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_unpackhi_epi16 - #define _mm256_unpackhi_epi16(a, b) simde_mm256_unpackhi_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_unpackhi_epi32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_unpackhi_epi32(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_unpackhi_epi32(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_unpackhi_epi32(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.i32, b_.i32, - 2, 10, 3, 11, 6, 14, 7, 15); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0]) / 2) ; i++) { - r_.i32[2 * i] = a_.i32[i + 2 + ~(~i | 1)]; - r_.i32[2 * i + 1] = b_.i32[i + 2 + ~(~i | 1)]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_unpackhi_epi32 - #define _mm256_unpackhi_epi32(a, b) simde_mm256_unpackhi_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_unpackhi_epi64 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_unpackhi_epi64(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_unpackhi_epi64(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_unpackhi_epi64(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 32, a_.i64, b_.i64, 1, 5, 3, 7); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0]) / 2) ; i++) { - r_.i64[2 * i] = a_.i64[2 * i + 1]; - r_.i64[2 * i + 1] = b_.i64[2 * i + 1]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_unpackhi_epi64 - #define _mm256_unpackhi_epi64(a, b) simde_mm256_unpackhi_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_xor_si256 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_xor_si256(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_xor_si128(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_xor_si128(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f ^ b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[i] ^ b_.i64[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_xor_si256 - #define _mm256_xor_si256(a, b) simde_mm256_xor_si256(a, b) -#endif - -SIMDE_END_DECLS_ - -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX2_H) */ diff --git a/extern/simde/x86/f16c.h b/extern/simde/x86/f16c.h deleted file mode 100644 index 9522bf6f6..000000000 --- a/extern/simde/x86/f16c.h +++ /dev/null @@ -1,172 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a 
copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2021 Evan Nemerson - */ - -#include "../simde-common.h" -#include "../simde-math.h" -#include "../simde-f16.h" - -#if !defined(SIMDE_X86_F16C_H) -#define SIMDE_X86_F16C_H - -#include "avx.h" - -#if !defined(SIMDE_X86_PF16C_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES) -# define SIMDE_X86_PF16C_ENABLE_NATIVE_ALIASES -#endif - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtps_ph(simde__m128 a, const int imm8) { - simde__m128_private a_ = simde__m128_to_private(a); - simde__m128i_private r_ = simde__m128i_to_private(simde_mm_setzero_si128()); - - HEDLEY_STATIC_CAST(void, imm8); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - r_.neon_f16 = vcombine_f16(vcvt_f16_f32(a_.neon_f32), vdup_n_f16(SIMDE_FLOAT16_C(0.0))); - #elif defined(SIMDE_FLOAT16_VECTOR) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - r_.f16[i] = simde_float16_from_float32(a_.f32[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - r_.u16[i] = simde_float16_as_uint16(simde_float16_from_float32(a_.f32[i])); - } - #endif - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_F16C_NATIVE) - #define simde_mm_cvtps_ph(a, imm8) _mm_cvtps_ph(a, imm8) -#endif -#if defined(SIMDE_X86_F16C_ENABLE_NATIVE_ALIASES) - #define _mm_cvtps_ph(a, sae) simde_mm_cvtps_ph(a, sae) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cvtph_ps(simde__m128i a) { - #if defined(SIMDE_X86_F16C_NATIVE) - return _mm_cvtph_ps(a); - #else - simde__m128i_private a_ = simde__m128i_to_private(a); - simde__m128_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - r_.neon_f32 = vcvt_f32_f16(vget_low_f16(a_.neon_f16)); - #elif defined(SIMDE_FLOAT16_VECTOR) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - r_.f32[i] = simde_float16_to_float32(a_.f16[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - r_.f32[i] = simde_float16_to_float32(simde_uint16_as_float16(a_.u16[i])); - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_F16C_ENABLE_NATIVE_ALIASES) - #define _mm_cvtph_ps(a) simde_mm_cvtph_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm256_cvtps_ph(simde__m256 a, const int imm8) { - 
simde__m256_private a_ = simde__m256_to_private(a); - simde__m128i_private r_; - - HEDLEY_STATIC_CAST(void, imm8); - - #if defined(SIMDE_FLOAT16_VECTOR) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - r_.f16[i] = simde_float16_from_float32(a_.f32[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - r_.u16[i] = simde_float16_as_uint16(simde_float16_from_float32(a_.f32[i])); - } - #endif - - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_F16C_NATIVE) - #define simde_mm256_cvtps_ph(a, imm8) _mm256_cvtps_ph(a, imm8) -#endif -#if defined(SIMDE_X86_F16C_ENABLE_NATIVE_ALIASES) - #define _mm256_cvtps_ph(a, imm8) simde_mm256_cvtps_ph(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_cvtph_ps(simde__m128i a) { - #if defined(SIMDE_X86_F16C_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_cvtph_ps(a); - #elif defined(SIMDE_X86_F16C_NATIVE) - return _mm256_setr_m128( - _mm_cvtph_ps(a), - _mm_cvtph_ps(_mm_castps_si128(_mm_permute_ps(_mm_castsi128_ps(a), 0xee))) - ); - #else - simde__m128i_private a_ = simde__m128i_to_private(a); - simde__m256_private r_; - - #if defined(SIMDE_FLOAT16_VECTOR) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_float16_to_float32(a_.f16[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_float16_to_float32(simde_uint16_as_float16(a_.u16[i])); - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_F16C_ENABLE_NATIVE_ALIASES) - #define _mm256_cvtph_ps(a) simde_mm256_cvtph_ps(a) -#endif - -SIMDE_END_DECLS_ - -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_F16C_H) */ diff --git a/extern/simde/x86/fma.h b/extern/simde/x86/fma.h deleted file mode 100644 index 630efc54a..000000000 --- a/extern/simde/x86/fma.h +++ /dev/null @@ -1,732 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2019 Evan Nemerson - */ - -#if !defined(SIMDE_X86_FMA_H) -#define SIMDE_X86_FMA_H - -#include "avx.h" - -#if !defined(SIMDE_X86_FMA_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES) -# define SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES -#endif - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_fmadd_pd (simde__m128d a, simde__m128d b, simde__m128d c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm_fmadd_pd(a, b, c); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b), - c_ = simde__m128d_to_private(c), - r_; - - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = vec_madd(a_.altivec_f64, b_.altivec_f64, c_.altivec_f64); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vfmaq_f64(c_.neon_f64, b_.neon_f64, a_.neon_f64); - #elif defined(simde_math_fma) && (defined(__FP_FAST_FMA) || defined(FP_FAST_FMA)) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_fma(a_.f64[i], b_.f64[i], c_.f64[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = (a_.f64[i] * b_.f64[i]) + c_.f64[i]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fmadd_pd - #define _mm_fmadd_pd(a, b, c) simde_mm_fmadd_pd(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_fmadd_pd (simde__m256d a, simde__m256d b, simde__m256d c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm256_fmadd_pd(a, b, c); - #else - return simde_mm256_add_pd(simde_mm256_mul_pd(a, b), c); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm256_fmadd_pd - #define _mm256_fmadd_pd(a, b, c) simde_mm256_fmadd_pd(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_fmadd_ps (simde__m128 a, simde__m128 b, simde__m128 c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm_fmadd_ps(a, b, c); - #else - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b), - c_ = simde__m128_to_private(c), - r_; - - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - r_.altivec_f32 = vec_madd(a_.altivec_f32, b_.altivec_f32, c_.altivec_f32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) - r_.neon_f32 = vfmaq_f32(c_.neon_f32, b_.neon_f32, a_.neon_f32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vmlaq_f32(c_.neon_f32, b_.neon_f32, a_.neon_f32); - #elif defined(simde_math_fmaf) && (defined(__FP_FAST_FMAF) || defined(FP_FAST_FMAF)) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_fmaf(a_.f32[i], b_.f32[i], c_.f32[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = (a_.f32[i] * b_.f32[i]) + c_.f32[i]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fmadd_ps - #define _mm_fmadd_ps(a, b, c) simde_mm_fmadd_ps(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_fmadd_ps (simde__m256 a, simde__m256 b, simde__m256 c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm256_fmadd_ps(a, b, c); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) - simde__m256_private - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b), 
- c_ = simde__m256_to_private(c), - r_; - - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_mm_fmadd_ps(a_.m128[i], b_.m128[i], c_.m128[i]); - } - - return simde__m256_from_private(r_); - #else - return simde_mm256_add_ps(simde_mm256_mul_ps(a, b), c); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm256_fmadd_ps - #define _mm256_fmadd_ps(a, b, c) simde_mm256_fmadd_ps(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_fmadd_sd (simde__m128d a, simde__m128d b, simde__m128d c) { - #if defined(SIMDE_X86_FMA_NATIVE) && !defined(SIMDE_BUG_MCST_LCC_FMA_WRONG_RESULT) - return _mm_fmadd_sd(a, b, c); - #else - return simde_mm_add_sd(simde_mm_mul_sd(a, b), c); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fmadd_sd - #define _mm_fmadd_sd(a, b, c) simde_mm_fmadd_sd(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_fmadd_ss (simde__m128 a, simde__m128 b, simde__m128 c) { - #if defined(SIMDE_X86_FMA_NATIVE) && !defined(SIMDE_BUG_MCST_LCC_FMA_WRONG_RESULT) - return _mm_fmadd_ss(a, b, c); - #else - return simde_mm_add_ss(simde_mm_mul_ss(a, b), c); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fmadd_ss - #define _mm_fmadd_ss(a, b, c) simde_mm_fmadd_ss(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_fmaddsub_pd (simde__m128d a, simde__m128d b, simde__m128d c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm_fmaddsub_pd(a, b, c); - #else - return simde_mm_addsub_pd(simde_mm_mul_pd(a, b), c); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fmaddsub_pd - #define _mm_fmaddsub_pd(a, b, c) simde_mm_fmaddsub_pd(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_fmaddsub_pd (simde__m256d a, simde__m256d b, simde__m256d c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm256_fmaddsub_pd(a, b, c); - #else - return simde_mm256_addsub_pd(simde_mm256_mul_pd(a, b), c); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm256_fmaddsub_pd - #define _mm256_fmaddsub_pd(a, b, c) simde_mm256_fmaddsub_pd(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_fmaddsub_ps (simde__m128 a, simde__m128 b, simde__m128 c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm_fmaddsub_ps(a, b, c); - #else - return simde_mm_addsub_ps(simde_mm_mul_ps(a, b), c); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fmaddsub_ps - #define _mm_fmaddsub_ps(a, b, c) simde_mm_fmaddsub_ps(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_fmaddsub_ps (simde__m256 a, simde__m256 b, simde__m256 c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm256_fmaddsub_ps(a, b, c); - #else - return simde_mm256_addsub_ps(simde_mm256_mul_ps(a, b), c); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm256_fmaddsub_ps - #define _mm256_fmaddsub_ps(a, b, c) simde_mm256_fmaddsub_ps(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_fmsub_pd (simde__m128d a, simde__m128d b, simde__m128d c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm_fmsub_pd(a, b, c); - #else - return simde_mm_sub_pd(simde_mm_mul_pd(a, b), c); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fmsub_pd - #define _mm_fmsub_pd(a, b, c) simde_mm_fmsub_pd(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_fmsub_pd (simde__m256d a, simde__m256d b, simde__m256d c) { - #if defined(SIMDE_X86_FMA_NATIVE) - 
return _mm256_fmsub_pd(a, b, c); - #else - return simde_mm256_sub_pd(simde_mm256_mul_pd(a, b), c); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm256_fmsub_pd - #define _mm256_fmsub_pd(a, b, c) simde_mm256_fmsub_pd(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_fmsub_ps (simde__m128 a, simde__m128 b, simde__m128 c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm_fmsub_ps(a, b, c); - #else - return simde_mm_sub_ps(simde_mm_mul_ps(a, b), c); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fmsub_ps - #define _mm_fmsub_ps(a, b, c) simde_mm_fmsub_ps(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_fmsub_ps (simde__m256 a, simde__m256 b, simde__m256 c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm256_fmsub_ps(a, b, c); - #else - return simde_mm256_sub_ps(simde_mm256_mul_ps(a, b), c); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm256_fmsub_ps - #define _mm256_fmsub_ps(a, b, c) simde_mm256_fmsub_ps(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_fmsub_sd (simde__m128d a, simde__m128d b, simde__m128d c) { - #if defined(SIMDE_X86_FMA_NATIVE) && !defined(SIMDE_BUG_MCST_LCC_FMA_WRONG_RESULT) - return _mm_fmsub_sd(a, b, c); - #else - return simde_mm_sub_sd(simde_mm_mul_sd(a, b), c); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fmsub_sd - #define _mm_fmsub_sd(a, b, c) simde_mm_fmsub_sd(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_fmsub_ss (simde__m128 a, simde__m128 b, simde__m128 c) { - #if defined(SIMDE_X86_FMA_NATIVE) && !defined(SIMDE_BUG_MCST_LCC_FMA_WRONG_RESULT) - return _mm_fmsub_ss(a, b, c); - #else - return simde_mm_sub_ss(simde_mm_mul_ss(a, b), c); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fmsub_ss - #define _mm_fmsub_ss(a, b, c) simde_mm_fmsub_ss(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_fmsubadd_pd (simde__m128d a, simde__m128d b, simde__m128d c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm_fmsubadd_pd(a, b, c); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b), - c_ = simde__m128d_to_private(c); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i += 2) { - r_.f64[ i ] = (a_.f64[ i ] * b_.f64[ i ]) + c_.f64[ i ]; - r_.f64[i + 1] = (a_.f64[i + 1] * b_.f64[i + 1]) - c_.f64[i + 1]; - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fmsubadd_pd - #define _mm_fmsubadd_pd(a, b, c) simde_mm_fmsubadd_pd(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_fmsubadd_pd (simde__m256d a, simde__m256d b, simde__m256d c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm256_fmsubadd_pd(a, b, c); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b), - c_ = simde__m256d_to_private(c); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i += 2) { - r_.f64[ i ] = (a_.f64[ i ] * b_.f64[ i ]) + c_.f64[ i ]; - r_.f64[i + 1] = (a_.f64[i + 1] * b_.f64[i + 1]) - c_.f64[i + 1]; - } - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm256_fmsubadd_pd - #define _mm256_fmsubadd_pd(a, b, c) simde_mm256_fmsubadd_pd(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_fmsubadd_ps (simde__m128 a, simde__m128 
b, simde__m128 c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm_fmsubadd_ps(a, b, c); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b), - c_ = simde__m128_to_private(c); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i += 2) { - r_.f32[ i ] = (a_.f32[ i ] * b_.f32[ i ]) + c_.f32[ i ]; - r_.f32[i + 1] = (a_.f32[i + 1] * b_.f32[i + 1]) - c_.f32[i + 1]; - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fmsubadd_ps - #define _mm_fmsubadd_ps(a, b, c) simde_mm_fmsubadd_ps(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_fmsubadd_ps (simde__m256 a, simde__m256 b, simde__m256 c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm256_fmsubadd_ps(a, b, c); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b), - c_ = simde__m256_to_private(c); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i += 2) { - r_.f32[ i ] = (a_.f32[ i ] * b_.f32[ i ]) + c_.f32[ i ]; - r_.f32[i + 1] = (a_.f32[i + 1] * b_.f32[i + 1]) - c_.f32[i + 1]; - } - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm256_fmsubadd_ps - #define _mm256_fmsubadd_ps(a, b, c) simde_mm256_fmsubadd_ps(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_fnmadd_pd (simde__m128d a, simde__m128d b, simde__m128d c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm_fnmadd_pd(a, b, c); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b), - c_ = simde__m128d_to_private(c); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vfmsq_f64(c_.neon_f64, a_.neon_f64, b_.neon_f64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = -(a_.f64[i] * b_.f64[i]) + c_.f64[i]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fnmadd_pd - #define _mm_fnmadd_pd(a, b, c) simde_mm_fnmadd_pd(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_fnmadd_pd (simde__m256d a, simde__m256d b, simde__m256d c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm256_fnmadd_pd(a, b, c); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b), - c_ = simde__m256d_to_private(c); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = -(a_.f64[i] * b_.f64[i]) + c_.f64[i]; - } - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm256_fnmadd_pd - #define _mm256_fnmadd_pd(a, b, c) simde_mm256_fnmadd_pd(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_fnmadd_ps (simde__m128 a, simde__m128 b, simde__m128 c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm_fnmadd_ps(a, b, c); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b), - c_ = simde__m128_to_private(c); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) - r_.neon_f32 = vfmsq_f32(c_.neon_f32, a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vmlsq_f32(c_.neon_f32, a_.neon_f32, b_.neon_f32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) 
; i++) { - r_.f32[i] = -(a_.f32[i] * b_.f32[i]) + c_.f32[i]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fnmadd_ps - #define _mm_fnmadd_ps(a, b, c) simde_mm_fnmadd_ps(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_fnmadd_ps (simde__m256 a, simde__m256 b, simde__m256 c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm256_fnmadd_ps(a, b, c); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b), - c_ = simde__m256_to_private(c); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = -(a_.f32[i] * b_.f32[i]) + c_.f32[i]; - } - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm256_fnmadd_ps - #define _mm256_fnmadd_ps(a, b, c) simde_mm256_fnmadd_ps(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_fnmadd_sd (simde__m128d a, simde__m128d b, simde__m128d c) { - #if defined(SIMDE_X86_FMA_NATIVE) && !defined(SIMDE_BUG_MCST_LCC_FMA_WRONG_RESULT) - return _mm_fnmadd_sd(a, b, c); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b), - c_ = simde__m128d_to_private(c); - - r_ = a_; - r_.f64[0] = -(a_.f64[0] * b_.f64[0]) + c_.f64[0]; - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fnmadd_sd - #define _mm_fnmadd_sd(a, b, c) simde_mm_fnmadd_sd(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_fnmadd_ss (simde__m128 a, simde__m128 b, simde__m128 c) { - #if defined(SIMDE_X86_FMA_NATIVE) && !defined(SIMDE_BUG_MCST_LCC_FMA_WRONG_RESULT) - return _mm_fnmadd_ss(a, b, c); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b), - c_ = simde__m128_to_private(c); - - r_ = a_; - r_.f32[0] = -(a_.f32[0] * b_.f32[0]) + c_.f32[0]; - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fnmadd_ss - #define _mm_fnmadd_ss(a, b, c) simde_mm_fnmadd_ss(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_fnmsub_pd (simde__m128d a, simde__m128d b, simde__m128d c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm_fnmsub_pd(a, b, c); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b), - c_ = simde__m128d_to_private(c); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = -(a_.f64[i] * b_.f64[i]) - c_.f64[i]; - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fnmsub_pd - #define _mm_fnmsub_pd(a, b, c) simde_mm_fnmsub_pd(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_fnmsub_pd (simde__m256d a, simde__m256d b, simde__m256d c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm256_fnmsub_pd(a, b, c); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b), - c_ = simde__m256d_to_private(c); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = -(a_.f64[i] * b_.f64[i]) - c_.f64[i]; - } - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm256_fnmsub_pd - #define _mm256_fnmsub_pd(a, b, c) simde_mm256_fnmsub_pd(a, b, c) 
-#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_fnmsub_ps (simde__m128 a, simde__m128 b, simde__m128 c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm_fnmsub_ps(a, b, c); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b), - c_ = simde__m128_to_private(c); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = -(a_.f32[i] * b_.f32[i]) - c_.f32[i]; - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fnmsub_ps - #define _mm_fnmsub_ps(a, b, c) simde_mm_fnmsub_ps(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_fnmsub_ps (simde__m256 a, simde__m256 b, simde__m256 c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm256_fnmsub_ps(a, b, c); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b), - c_ = simde__m256_to_private(c); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = -(a_.f32[i] * b_.f32[i]) - c_.f32[i]; - } - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm256_fnmsub_ps - #define _mm256_fnmsub_ps(a, b, c) simde_mm256_fnmsub_ps(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_fnmsub_sd (simde__m128d a, simde__m128d b, simde__m128d c) { - #if defined(SIMDE_X86_FMA_NATIVE) && !defined(SIMDE_BUG_MCST_LCC_FMA_WRONG_RESULT) - return _mm_fnmsub_sd(a, b, c); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b), - c_ = simde__m128d_to_private(c); - - r_ = a_; - r_.f64[0] = -(a_.f64[0] * b_.f64[0]) - c_.f64[0]; - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fnmsub_sd - #define _mm_fnmsub_sd(a, b, c) simde_mm_fnmsub_sd(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_fnmsub_ss (simde__m128 a, simde__m128 b, simde__m128 c) { - #if defined(SIMDE_X86_FMA_NATIVE) && !defined(SIMDE_BUG_MCST_LCC_FMA_WRONG_RESULT) - return _mm_fnmsub_ss(a, b, c); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b), - c_ = simde__m128_to_private(c); - - r_ = simde__m128_to_private(a); - r_.f32[0] = -(a_.f32[0] * b_.f32[0]) - c_.f32[0]; - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fnmsub_ss - #define _mm_fnmsub_ss(a, b, c) simde_mm_fnmsub_ss(a, b, c) -#endif - -SIMDE_END_DECLS_ - -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_FMA_H) */ diff --git a/extern/simde/x86/mmx.h b/extern/simde/x86/mmx.h deleted file mode 100644 index e294af8e9..000000000 --- a/extern/simde/x86/mmx.h +++ /dev/null @@ -1,2398 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Copyright:
- *   2017-2020 Evan Nemerson
- */
-
-#if !defined(SIMDE_X86_MMX_H)
-#define SIMDE_X86_MMX_H
-
-#include "../simde-common.h"
-
-HEDLEY_DIAGNOSTIC_PUSH
-SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
-
-#if defined(SIMDE_X86_MMX_NATIVE)
-  #define SIMDE_X86_MMX_USE_NATIVE_TYPE
-#elif defined(SIMDE_X86_SSE_NATIVE)
-  #define SIMDE_X86_MMX_USE_NATIVE_TYPE
-#endif
-
-#if defined(SIMDE_X86_MMX_USE_NATIVE_TYPE)
-  #include <mmintrin.h>
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
-  #include <arm_neon.h>
-#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
-  #include <loongson-mmiintrin.h>
-#endif
-
-#include <stdint.h>
-#include <limits.h>
-
-SIMDE_BEGIN_DECLS_
-
-typedef union {
-  #if defined(SIMDE_VECTOR_SUBSCRIPT)
-    SIMDE_ALIGN_TO_8 int8_t i8 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
-    SIMDE_ALIGN_TO_8 int16_t i16 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
-    SIMDE_ALIGN_TO_8 int32_t i32 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
-    SIMDE_ALIGN_TO_8 int64_t i64 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
-    SIMDE_ALIGN_TO_8 uint8_t u8 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
-    SIMDE_ALIGN_TO_8 uint16_t u16 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
-    SIMDE_ALIGN_TO_8 uint32_t u32 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
-    SIMDE_ALIGN_TO_8 uint64_t u64 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
-    SIMDE_ALIGN_TO_8 simde_float32 f32 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
-    SIMDE_ALIGN_TO_8 int_fast32_t i32f SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
-    SIMDE_ALIGN_TO_8 uint_fast32_t u32f SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
-  #else
-    SIMDE_ALIGN_TO_8 int8_t i8[8];
-    SIMDE_ALIGN_TO_8 int16_t i16[4];
-    SIMDE_ALIGN_TO_8 int32_t i32[2];
-    SIMDE_ALIGN_TO_8 int64_t i64[1];
-    SIMDE_ALIGN_TO_8 uint8_t u8[8];
-    SIMDE_ALIGN_TO_8 uint16_t u16[4];
-    SIMDE_ALIGN_TO_8 uint32_t u32[2];
-    SIMDE_ALIGN_TO_8 uint64_t u64[1];
-    SIMDE_ALIGN_TO_8 simde_float32 f32[2];
-    SIMDE_ALIGN_TO_8 int_fast32_t i32f[8 / sizeof(int_fast32_t)];
-    SIMDE_ALIGN_TO_8 uint_fast32_t u32f[8 / sizeof(uint_fast32_t)];
-  #endif
-
-  #if defined(SIMDE_X86_MMX_USE_NATIVE_TYPE)
-    __m64 n;
-  #endif
-  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
-    int8x8_t neon_i8;
-    int16x4_t neon_i16;
-    int32x2_t neon_i32;
-    int64x1_t neon_i64;
-    uint8x8_t neon_u8;
-    uint16x4_t neon_u16;
-    uint32x2_t neon_u32;
-    uint64x1_t neon_u64;
-    float32x2_t neon_f32;
-  #endif
-  #if defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
-    int8x8_t mmi_i8;
-    int16x4_t mmi_i16;
-    int32x2_t mmi_i32;
-    int64_t mmi_i64;
-    uint8x8_t mmi_u8;
-    uint16x4_t mmi_u16;
-    uint32x2_t mmi_u32;
-    uint64_t mmi_u64;
-  #endif
-} simde__m64_private;
-
-#if defined(SIMDE_X86_MMX_USE_NATIVE_TYPE)
-  typedef __m64 simde__m64;
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
-  typedef int32x2_t simde__m64;
-#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
-  typedef int32x2_t simde__m64;
-#elif defined(SIMDE_VECTOR_SUBSCRIPT)
-  typedef int32_t simde__m64 SIMDE_ALIGN_TO_8 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
-#else
-  typedef simde__m64_private simde__m64;
-#endif
-
-#if !defined(SIMDE_X86_MMX_USE_NATIVE_TYPE) && defined(SIMDE_ENABLE_NATIVE_ALIASES)
-  #define SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES
-  typedef simde__m64 __m64;
-#endif
-
-HEDLEY_STATIC_ASSERT(8 == sizeof(simde__m64), "__m64 size incorrect");
-HEDLEY_STATIC_ASSERT(8 == sizeof(simde__m64_private), "__m64 size incorrect"); -#if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF) -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m64) == 8, "simde__m64 is not 8-byte aligned"); -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m64_private) == 8, "simde__m64_private is not 8-byte aligned"); -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde__m64_from_private(simde__m64_private v) { - simde__m64 r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64_private -simde__m64_to_private(simde__m64 v) { - simde__m64_private r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -#define SIMDE_X86_GENERATE_CONVERSION_FUNCTION(simde_type, source_type, isax, fragment) \ - SIMDE_FUNCTION_ATTRIBUTES \ - simde__##simde_type \ - simde__##simde_type##_from_##isax##_##fragment(source_type value) { \ - simde__##simde_type##_private r_; \ - r_.isax##_##fragment = value; \ - return simde__##simde_type##_from_private(r_); \ - } \ - \ - SIMDE_FUNCTION_ATTRIBUTES \ - source_type \ - simde__##simde_type##_to_##isax##_##fragment(simde__##simde_type value) { \ - simde__##simde_type##_private r_ = simde__##simde_type##_to_private(value); \ - return r_.isax##_##fragment; \ - } - -#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int8x8_t, neon, i8) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int16x4_t, neon, i16) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int32x2_t, neon, i32) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int64x1_t, neon, i64) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint8x8_t, neon, u8) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint16x4_t, neon, u16) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint32x2_t, neon, u32) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint64x1_t, neon, u64) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, float32x2_t, neon, f32) -#endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */ - -#if defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int8x8_t, mmi, i8) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int16x4_t, mmi, i16) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int32x2_t, mmi, i32) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int64_t, mmi, i64) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint8x8_t, mmi, u8) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint16x4_t, mmi, u16) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint32x2_t, mmi, u32) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint64_t, mmi, u64) -#endif /* defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) */ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_add_pi8 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_add_pi8(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vadd_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i8 = paddb_s(a_.mmi_i8, b_.mmi_i8); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = a_.i8 + b_.i8; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[i] + b_.i8[i]; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_paddb(a, b) simde_mm_add_pi8(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_add_pi8(a, b) simde_mm_add_pi8(a, b) -# define _m_paddb(a, b) simde_m_paddb(a, b) -#endif - 
-SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_add_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_add_pi16(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vadd_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i16 = paddh_s(a_.mmi_i16, b_.mmi_i16); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = a_.i16 + b_.i16; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[i] + b_.i16[i]; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_paddw(a, b) simde_mm_add_pi16(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_add_pi16(a, b) simde_mm_add_pi16(a, b) -# define _m_paddw(a, b) simde_mm_add_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_add_pi32 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_add_pi32(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vadd_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i32 = paddw_s(a_.mmi_i32, b_.mmi_i32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = a_.i32 + b_.i32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] + b_.i32[i]; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_paddd(a, b) simde_mm_add_pi32(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_add_pi32(a, b) simde_mm_add_pi32(a, b) -# define _m_paddd(a, b) simde_mm_add_pi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_adds_pi8 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_adds_pi8(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vqadd_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i8 = paddsb(a_.mmi_i8, b_.mmi_i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - if ((((b_.i8[i]) > 0) && ((a_.i8[i]) > (INT8_MAX - (b_.i8[i]))))) { - r_.i8[i] = INT8_MAX; - } else if ((((b_.i8[i]) < 0) && ((a_.i8[i]) < (INT8_MIN - (b_.i8[i]))))) { - r_.i8[i] = INT8_MIN; - } else { - r_.i8[i] = (a_.i8[i]) + (b_.i8[i]); - } - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_paddsb(a, b) simde_mm_adds_pi8(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_adds_pi8(a, b) simde_mm_adds_pi8(a, b) -# define _m_paddsb(a, b) simde_mm_adds_pi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_adds_pu8 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_adds_pu8(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vqadd_u8(a_.neon_u8, b_.neon_u8); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_u8 = paddusb(a_.mmi_u8, b_.mmi_u8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / 
sizeof(r_.u8[0])) ; i++) { - const uint_fast16_t x = HEDLEY_STATIC_CAST(uint_fast16_t, a_.u8[i]) + HEDLEY_STATIC_CAST(uint_fast16_t, b_.u8[i]); - if (x > UINT8_MAX) - r_.u8[i] = UINT8_MAX; - else - r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, x); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_paddusb(a, b) simde_mm_adds_pu8(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_adds_pu8(a, b) simde_mm_adds_pu8(a, b) -# define _m_paddusb(a, b) simde_mm_adds_pu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_adds_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_adds_pi16(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vqadd_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i16 = paddsh(a_.mmi_i16, b_.mmi_i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - if ((((b_.i16[i]) > 0) && ((a_.i16[i]) > (INT16_MAX - (b_.i16[i]))))) { - r_.i16[i] = INT16_MAX; - } else if ((((b_.i16[i]) < 0) && ((a_.i16[i]) < (SHRT_MIN - (b_.i16[i]))))) { - r_.i16[i] = SHRT_MIN; - } else { - r_.i16[i] = (a_.i16[i]) + (b_.i16[i]); - } - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_paddsw(a, b) simde_mm_adds_pi16(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_adds_pi16(a, b) simde_mm_adds_pi16(a, b) -# define _m_paddsw(a, b) simde_mm_adds_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_adds_pu16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_adds_pu16(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vqadd_u16(a_.neon_u16, b_.neon_u16); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_u16 = paddush(a_.mmi_u16, b_.mmi_u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - const uint32_t x = a_.u16[i] + b_.u16[i]; - if (x > UINT16_MAX) - r_.u16[i] = UINT16_MAX; - else - r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, x); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_paddusw(a, b) simde_mm_adds_pu16(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_adds_pu16(a, b) simde_mm_adds_pu16(a, b) -# define _m_paddusw(a, b) simde_mm_adds_pu16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_and_si64 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_and_si64(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vand_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = a_.i64 & b_.i64; - #else - r_.i64[0] = a_.i64[0] & b_.i64[0]; - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pand(a, b) simde_mm_and_si64(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_and_si64(a, b) simde_mm_and_si64(a, b) -# define _m_pand(a, b) simde_mm_and_si64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_andnot_si64 (simde__m64 a, simde__m64 b) { - #if 
defined(SIMDE_X86_MMX_NATIVE) - return _mm_andnot_si64(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vbic_s32(b_.neon_i32, a_.neon_i32); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i32 = pandn_sw(a_.mmi_i32, b_.mmi_i32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = ~a_.i32f & b_.i32f; - #else - r_.u64[0] = (~(a_.u64[0])) & (b_.u64[0]); - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pandn(a, b) simde_mm_andnot_si64(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_andnot_si64(a, b) simde_mm_andnot_si64(a, b) -# define _m_pandn(a, b) simde_mm_andnot_si64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_cmpeq_pi8 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_cmpeq_pi8(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vceq_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i8 = pcmpeqb_s(a_.mmi_i8, b_.mmi_i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] == b_.i8[i]) ? ~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pcmpeqb(a, b) simde_mm_cmpeq_pi8(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_cmpeq_pi8(a, b) simde_mm_cmpeq_pi8(a, b) -# define _m_pcmpeqb(a, b) simde_mm_cmpeq_pi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_cmpeq_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_cmpeq_pi16(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vceq_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i16 = pcmpeqh_s(a_.mmi_i16, b_.mmi_i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] == b_.i16[i]) ? ~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pcmpeqw(a, b) simde_mm_cmpeq_pi16(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_cmpeq_pi16(a, b) simde_mm_cmpeq_pi16(a, b) -# define _m_pcmpeqw(a, b) simde_mm_cmpeq_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_cmpeq_pi32 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_cmpeq_pi32(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vceq_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i32 = pcmpeqw_s(a_.mmi_i32, b_.mmi_i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? 
~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pcmpeqd(a, b) simde_mm_cmpeq_pi32(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_cmpeq_pi32(a, b) simde_mm_cmpeq_pi32(a, b) -# define _m_pcmpeqd(a, b) simde_mm_cmpeq_pi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_cmpgt_pi8 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_cmpgt_pi8(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vcgt_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i8 = pcmpgtb_s(a_.mmi_i8, b_.mmi_i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] > b_.i8[i]) ? ~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pcmpgtb(a, b) simde_mm_cmpgt_pi8(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_cmpgt_pi8(a, b) simde_mm_cmpgt_pi8(a, b) -# define _m_pcmpgtb(a, b) simde_mm_cmpgt_pi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_cmpgt_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_cmpgt_pi16(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vcgt_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i16 = pcmpgth_s(a_.mmi_i16, b_.mmi_i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? ~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pcmpgtw(a, b) simde_mm_cmpgt_pi16(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_cmpgt_pi16(a, b) simde_mm_cmpgt_pi16(a, b) -# define _m_pcmpgtw(a, b) simde_mm_cmpgt_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_cmpgt_pi32 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_cmpgt_pi32(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vcgt_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i32 = pcmpgtw_s(a_.mmi_i32, b_.mmi_i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = (a_.i32[i] > b_.i32[i]) ? 
~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pcmpgtd(a, b) simde_mm_cmpgt_pi32(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_cmpgt_pi32(a, b) simde_mm_cmpgt_pi32(a, b) -# define _m_pcmpgtd(a, b) simde_mm_cmpgt_pi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int64_t -simde_mm_cvtm64_si64 (simde__m64 a) { - #if defined(SIMDE_X86_MMX_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(__PGI) - return _mm_cvtm64_si64(a); - #else - simde__m64_private a_ = simde__m64_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - HEDLEY_DIAGNOSTIC_PUSH - #if HEDLEY_HAS_WARNING("-Wvector-conversion") && SIMDE_DETECT_CLANG_VERSION_NOT(10,0,0) - #pragma clang diagnostic ignored "-Wvector-conversion" - #endif - return vget_lane_s64(a_.neon_i64, 0); - HEDLEY_DIAGNOSTIC_POP - #else - return a_.i64[0]; - #endif - #endif -} -#define simde_m_to_int64(a) simde_mm_cvtm64_si64(a) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) -# define _mm_cvtm64_si64(a) simde_mm_cvtm64_si64(a) -# define _m_to_int64(a) simde_mm_cvtm64_si64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_cvtsi32_si64 (int32_t a) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_cvtsi32_si64(a); - #else - simde__m64_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - const int32_t av[2] = { a, 0 }; - r_.neon_i32 = vld1_s32(av); - #else - r_.i32[0] = a; - r_.i32[1] = 0; - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_from_int(a) simde_mm_cvtsi32_si64(a) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_cvtsi32_si64(a) simde_mm_cvtsi32_si64(a) -# define _m_from_int(a) simde_mm_cvtsi32_si64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_cvtsi64_m64 (int64_t a) { - #if defined(SIMDE_X86_MMX_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(__PGI) - return _mm_cvtsi64_m64(a); - #else - simde__m64_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vld1_s64(&a); - #else - r_.i64[0] = a; - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_from_int64(a) simde_mm_cvtsi64_m64(a) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) -# define _mm_cvtsi64_m64(a) simde_mm_cvtsi64_m64(a) -# define _m_from_int64(a) simde_mm_cvtsi64_m64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int32_t -simde_mm_cvtsi64_si32 (simde__m64 a) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_cvtsi64_si32(a); - #else - simde__m64_private a_ = simde__m64_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - HEDLEY_DIAGNOSTIC_PUSH - #if HEDLEY_HAS_WARNING("-Wvector-conversion") && SIMDE_DETECT_CLANG_VERSION_NOT(10,0,0) - #pragma clang diagnostic ignored "-Wvector-conversion" - #endif - return vget_lane_s32(a_.neon_i32, 0); - HEDLEY_DIAGNOSTIC_POP - #else - return a_.i32[0]; - #endif - #endif -} -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_cvtsi64_si32(a) simde_mm_cvtsi64_si32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_empty (void) { - #if defined(SIMDE_X86_MMX_NATIVE) - _mm_empty(); - #else - /* noop */ - #endif -} -#define simde_m_empty() simde_mm_empty() -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_empty() simde_mm_empty() -# define _m_empty() simde_mm_empty() -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_madd_pi16 (simde__m64 a, simde__m64 b) { - #if 
defined(SIMDE_X86_MMX_NATIVE) - return _mm_madd_pi16(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int32x4_t i1 = vmull_s16(a_.neon_i16, b_.neon_i16); - r_.neon_i32 = vpadd_s32(vget_low_s32(i1), vget_high_s32(i1)); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i32 = pmaddhw(a_.mmi_i16, b_.mmi_i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i += 2) { - r_.i32[i / 2] = (a_.i16[i] * b_.i16[i]) + (a_.i16[i + 1] * b_.i16[i + 1]); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pmaddwd(a, b) simde_mm_madd_pi16(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_madd_pi16(a, b) simde_mm_madd_pi16(a, b) -# define _m_pmaddwd(a, b) simde_mm_madd_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_mulhi_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_mulhi_pi16(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - const int32x4_t t1 = vmull_s16(a_.neon_i16, b_.neon_i16); - const uint32x4_t t2 = vshrq_n_u32(vreinterpretq_u32_s32(t1), 16); - const uint16x4_t t3 = vmovn_u32(t2); - r_.neon_u16 = t3; - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i16 = pmulhh(a_.mmi_i16, b_.mmi_i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, ((a_.i16[i] * b_.i16[i]) >> 16)); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pmulhw(a, b) simde_mm_mulhi_pi16(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_mulhi_pi16(a, b) simde_mm_mulhi_pi16(a, b) -# define _m_pmulhw(a, b) simde_mm_mulhi_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_mullo_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_mullo_pi16(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - const int32x4_t t1 = vmull_s16(a_.neon_i16, b_.neon_i16); - const uint16x4_t t2 = vmovn_u32(vreinterpretq_u32_s32(t1)); - r_.neon_u16 = t2; - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i16 = pmullh(a_.mmi_i16, b_.mmi_i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, ((a_.i16[i] * b_.i16[i]) & 0xffff)); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pmullw(a, b) simde_mm_mullo_pi16(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_mullo_pi16(a, b) simde_mm_mullo_pi16(a, b) -# define _m_pmullw(a, b) simde_mm_mullo_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_or_si64 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_or_si64(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vorr_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = a_.i64 | b_.i64; - #else - r_.i64[0] = a_.i64[0] | b_.i64[0]; 
- #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_por(a, b) simde_mm_or_si64(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_or_si64(a, b) simde_mm_or_si64(a, b) -# define _m_por(a, b) simde_mm_or_si64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_packs_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_packs_pi16(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vqmovn_s16(vcombine_s16(a_.neon_i16, b_.neon_i16)); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i8 = packsshb(a_.mmi_i16, b_.mmi_i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - if (a_.i16[i] < INT8_MIN) { - r_.i8[i] = INT8_MIN; - } else if (a_.i16[i] > INT8_MAX) { - r_.i8[i] = INT8_MAX; - } else { - r_.i8[i] = HEDLEY_STATIC_CAST(int8_t, a_.i16[i]); - } - } - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - if (b_.i16[i] < INT8_MIN) { - r_.i8[i + 4] = INT8_MIN; - } else if (b_.i16[i] > INT8_MAX) { - r_.i8[i + 4] = INT8_MAX; - } else { - r_.i8[i + 4] = HEDLEY_STATIC_CAST(int8_t, b_.i16[i]); - } - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_packsswb(a, b) simde_mm_packs_pi16(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_packs_pi16(a, b) simde_mm_packs_pi16(a, b) -# define _m_packsswb(a, b) simde_mm_packs_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_packs_pi32 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_packs_pi32(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vqmovn_s32(vcombine_s32(a_.neon_i32, b_.neon_i32)); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i16 = packsswh(a_.mmi_i32, b_.mmi_i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (8 / sizeof(a_.i32[0])) ; i++) { - if (a_.i32[i] < SHRT_MIN) { - r_.i16[i] = SHRT_MIN; - } else if (a_.i32[i] > INT16_MAX) { - r_.i16[i] = INT16_MAX; - } else { - r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i32[i]); - } - } - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (8 / sizeof(b_.i32[0])) ; i++) { - if (b_.i32[i] < SHRT_MIN) { - r_.i16[i + 2] = SHRT_MIN; - } else if (b_.i32[i] > INT16_MAX) { - r_.i16[i + 2] = INT16_MAX; - } else { - r_.i16[i + 2] = HEDLEY_STATIC_CAST(int16_t, b_.i32[i]); - } - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_packssdw(a, b) simde_mm_packs_pi32(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_packs_pi32(a, b) simde_mm_packs_pi32(a, b) -# define _m_packssdw(a, b) simde_mm_packs_pi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_packs_pu16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_packs_pu16(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - const int16x8_t t1 = vcombine_s16(a_.neon_i16, b_.neon_i16); - - /* Set elements which are < 0 to 0 */ - const int16x8_t t2 = vandq_s16(t1, vreinterpretq_s16_u16(vcgezq_s16(t1))); - - /* Vector with all s16 elements set to 
UINT8_MAX */ - const int16x8_t vmax = vmovq_n_s16(HEDLEY_STATIC_CAST(int16_t, UINT8_MAX)); - - /* Elements which are within the acceptable range */ - const int16x8_t le_max = vandq_s16(t2, vreinterpretq_s16_u16(vcleq_s16(t2, vmax))); - const int16x8_t gt_max = vandq_s16(vmax, vreinterpretq_s16_u16(vcgtq_s16(t2, vmax))); - - /* Final values as 16-bit integers */ - const int16x8_t values = vorrq_s16(le_max, gt_max); - - r_.neon_u8 = vmovn_u16(vreinterpretq_u16_s16(values)); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_u8 = packushb(a_.mmi_u16, b_.mmi_u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - if (a_.i16[i] > UINT8_MAX) { - r_.u8[i] = UINT8_MAX; - } else if (a_.i16[i] < 0) { - r_.u8[i] = 0; - } else { - r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, a_.i16[i]); - } - } - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - if (b_.i16[i] > UINT8_MAX) { - r_.u8[i + 4] = UINT8_MAX; - } else if (b_.i16[i] < 0) { - r_.u8[i + 4] = 0; - } else { - r_.u8[i + 4] = HEDLEY_STATIC_CAST(uint8_t, b_.i16[i]); - } - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_packuswb(a, b) simde_mm_packs_pu16(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_packs_pu16(a, b) simde_mm_packs_pu16(a, b) -# define _m_packuswb(a, b) simde_mm_packs_pu16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_set_pi8 (int8_t e7, int8_t e6, int8_t e5, int8_t e4, int8_t e3, int8_t e2, int8_t e1, int8_t e0) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0); - #else - simde__m64_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - const int8_t v[sizeof(r_.i8) / sizeof(r_.i8[0])] = { e0, e1, e2, e3, e4, e5, e6, e7 }; - r_.neon_i8 = vld1_s8(v); - #else - r_.i8[0] = e0; - r_.i8[1] = e1; - r_.i8[2] = e2; - r_.i8[3] = e3; - r_.i8[4] = e4; - r_.i8[5] = e5; - r_.i8[6] = e6; - r_.i8[7] = e7; - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_x_mm_set_pu8 (uint8_t e7, uint8_t e6, uint8_t e5, uint8_t e4, uint8_t e3, uint8_t e2, uint8_t e1, uint8_t e0) { - simde__m64_private r_; - - #if defined(SIMDE_X86_MMX_NATIVE) - r_.n = _mm_set_pi8( - HEDLEY_STATIC_CAST(int8_t, e7), - HEDLEY_STATIC_CAST(int8_t, e6), - HEDLEY_STATIC_CAST(int8_t, e5), - HEDLEY_STATIC_CAST(int8_t, e4), - HEDLEY_STATIC_CAST(int8_t, e3), - HEDLEY_STATIC_CAST(int8_t, e2), - HEDLEY_STATIC_CAST(int8_t, e1), - HEDLEY_STATIC_CAST(int8_t, e0)); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - const uint8_t v[sizeof(r_.u8) / sizeof(r_.u8[0])] = { e0, e1, e2, e3, e4, e5, e6, e7 }; - r_.neon_u8 = vld1_u8(v); - #else - r_.u8[0] = e0; - r_.u8[1] = e1; - r_.u8[2] = e2; - r_.u8[3] = e3; - r_.u8[4] = e4; - r_.u8[5] = e5; - r_.u8[6] = e6; - r_.u8[7] = e7; - #endif - - return simde__m64_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_set_pi16 (int16_t e3, int16_t e2, int16_t e1, int16_t e0) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_set_pi16(e3, e2, e1, e0); - #else - simde__m64_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - const int16_t v[sizeof(r_.i16) / sizeof(r_.i16[0])] = { e0, e1, e2, e3 }; - r_.neon_i16 = vld1_s16(v); - #else - r_.i16[0] = e0; - r_.i16[1] = e1; - r_.i16[2] = e2; - r_.i16[3] = e3; - #endif - - return 
simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_set_pi16(e3, e2, e1, e0) simde_mm_set_pi16(e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_x_mm_set_pu16 (uint16_t e3, uint16_t e2, uint16_t e1, uint16_t e0) { - simde__m64_private r_; - -#if defined(SIMDE_X86_MMX_NATIVE) - r_.n = _mm_set_pi16( - HEDLEY_STATIC_CAST(int16_t, e3), - HEDLEY_STATIC_CAST(int16_t, e2), - HEDLEY_STATIC_CAST(int16_t, e1), - HEDLEY_STATIC_CAST(int16_t, e0) - ); -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - const uint16_t v[sizeof(r_.u16) / sizeof(r_.u16[0])] = { e0, e1, e2, e3 }; - r_.neon_u16 = vld1_u16(v); -#else - r_.u16[0] = e0; - r_.u16[1] = e1; - r_.u16[2] = e2; - r_.u16[3] = e3; -#endif - - return simde__m64_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_x_mm_set_pu32 (uint32_t e1, uint32_t e0) { - simde__m64_private r_; - -#if defined(SIMDE_X86_MMX_NATIVE) - r_.n = _mm_set_pi32( - HEDLEY_STATIC_CAST(int32_t, e1), - HEDLEY_STATIC_CAST(int32_t, e0)); -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - const uint32_t v[sizeof(r_.u32) / sizeof(r_.u32[0])] = { e0, e1 }; - r_.neon_u32 = vld1_u32(v); -#else - r_.u32[0] = e0; - r_.u32[1] = e1; -#endif - - return simde__m64_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_set_pi32 (int32_t e1, int32_t e0) { - simde__m64_private r_; - -#if defined(SIMDE_X86_MMX_NATIVE) - r_.n = _mm_set_pi32(e1, e0); -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - const int32_t v[sizeof(r_.i32) / sizeof(r_.i32[0])] = { e0, e1 }; - r_.neon_i32 = vld1_s32(v); -#else - r_.i32[0] = e0; - r_.i32[1] = e1; -#endif - - return simde__m64_from_private(r_); -} -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_set_pi32(e1, e0) simde_mm_set_pi32(e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_x_mm_set_pi64 (int64_t e0) { - simde__m64_private r_; - -#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - const int64_t v[sizeof(r_.i64) / sizeof(r_.i64[0])] = { e0 }; - r_.neon_i64 = vld1_s64(v); -#else - r_.i64[0] = e0; -#endif - - return simde__m64_from_private(r_); -} - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_x_mm_set_f32x2 (simde_float32 e1, simde_float32 e0) { - simde__m64_private r_; - -#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - const simde_float32 v[sizeof(r_.f32) / sizeof(r_.f32[0])] = { e0, e1 }; - r_.neon_f32 = vld1_f32(v); -#else - r_.f32[0] = e0; - r_.f32[1] = e1; -#endif - - return simde__m64_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_set1_pi8 (int8_t a) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_set1_pi8(a); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - simde__m64_private r_; - r_.neon_i8 = vmov_n_s8(a); - return simde__m64_from_private(r_); - #else - return simde_mm_set_pi8(a, a, a, a, a, a, a, a); - #endif -} -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_set1_pi8(a) simde_mm_set1_pi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_set1_pi16 (int16_t a) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_set1_pi16(a); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - simde__m64_private r_; - r_.neon_i16 = vmov_n_s16(a); - return simde__m64_from_private(r_); - #else - return simde_mm_set_pi16(a, a, a, a); - #endif -} -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_set1_pi16(a) simde_mm_set1_pi16(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_set1_pi32 (int32_t a) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_set1_pi32(a); - #elif 
defined(SIMDE_ARM_NEON_A32V7_NATIVE) - simde__m64_private r_; - r_.neon_i32 = vmov_n_s32(a); - return simde__m64_from_private(r_); - #else - return simde_mm_set_pi32(a, a); - #endif -} -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_set1_pi32(a) simde_mm_set1_pi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_setr_pi8 (int8_t e7, int8_t e6, int8_t e5, int8_t e4, int8_t e3, int8_t e2, int8_t e1, int8_t e0) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0); - #else - return simde_mm_set_pi8(e0, e1, e2, e3, e4, e5, e6, e7); - #endif -} -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_setr_pi16 (int16_t e3, int16_t e2, int16_t e1, int16_t e0) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_setr_pi16(e3, e2, e1, e0); - #else - return simde_mm_set_pi16(e0, e1, e2, e3); - #endif -} -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_setr_pi16(e3, e2, e1, e0) simde_mm_setr_pi16(e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_setr_pi32 (int32_t e1, int32_t e0) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_setr_pi32(e1, e0); - #else - return simde_mm_set_pi32(e0, e1); - #endif -} -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_setr_pi32(e1, e0) simde_mm_setr_pi32(e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_setzero_si64 (void) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_setzero_si64(); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - simde__m64_private r_; - r_.neon_u32 = vmov_n_u32(0); - return simde__m64_from_private(r_); - #else - return simde_mm_set_pi32(0, 0); - #endif -} -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_setzero_si64() simde_mm_setzero_si64() -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_x_mm_load_si64 (const void* mem_addr) { - simde__m64 r; - simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m64), sizeof(r)); - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_x_mm_loadu_si64 (const void* mem_addr) { - simde__m64 r; - simde_memcpy(&r, mem_addr, sizeof(r)); - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_x_mm_store_si64 (void* mem_addr, simde__m64 value) { - simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m64), &value, sizeof(value)); -} - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_x_mm_storeu_si64 (void* mem_addr, simde__m64 value) { - simde_memcpy(mem_addr, &value, sizeof(value)); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_x_mm_setone_si64 (void) { - return simde_mm_set1_pi32(~INT32_C(0)); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_sll_pi16 (simde__m64 a, simde__m64 count) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_sll_pi16(a, count); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private count_ = simde__m64_to_private(count); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - HEDLEY_DIAGNOSTIC_PUSH - #if HEDLEY_HAS_WARNING("-Wvector-conversion") && SIMDE_DETECT_CLANG_VERSION_NOT(10,0,0) - #pragma clang diagnostic ignored "-Wvector-conversion" - #endif - r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16(HEDLEY_STATIC_CAST(int16_t, vget_lane_u64(count_.neon_u64, 0)))); - HEDLEY_DIAGNOSTIC_POP - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_BUG_CLANG_POWER9_16x4_BAD_SHIFT) - if (HEDLEY_UNLIKELY(count_.u64[0] > 15)) 
- return simde_mm_setzero_si64(); - - r_.i16 = a_.i16 << HEDLEY_STATIC_CAST(int16_t, count_.u64[0]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i16 = a_.i16 << count_.u64[0]; - #else - if (HEDLEY_UNLIKELY(count_.u64[0] > 15)) { - simde_memset(&r_, 0, sizeof(r_)); - return simde__m64_from_private(r_); - } - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, a_.u16[i] << count_.u64[0]); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psllw(a, count) simde_mm_sll_pi16(a, count) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_sll_pi16(a, count) simde_mm_sll_pi16(a, count) -# define _m_psllw(a, count) simde_mm_sll_pi16(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_sll_pi32 (simde__m64 a, simde__m64 count) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_sll_pi32(a, count); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private count_ = simde__m64_to_private(count); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - HEDLEY_DIAGNOSTIC_PUSH - #if HEDLEY_HAS_WARNING("-Wvector-conversion") && SIMDE_DETECT_CLANG_VERSION_NOT(10,0,0) - #pragma clang diagnostic ignored "-Wvector-conversion" - #endif - r_.neon_i32 = vshl_s32(a_.neon_i32, vmov_n_s32(HEDLEY_STATIC_CAST(int32_t, vget_lane_u64(count_.neon_u64, 0)))); - HEDLEY_DIAGNOSTIC_POP - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i32 = a_.i32 << count_.u64[0]; - #else - if (HEDLEY_UNLIKELY(count_.u64[0] > 31)) { - simde_memset(&r_, 0, sizeof(r_)); - return simde__m64_from_private(r_); - } - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] << count_.u64[0]; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pslld(a, count) simde_mm_sll_pi32(a, count) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_sll_pi32(a, count) simde_mm_sll_pi32(a, count) -# define _m_pslld(a, count) simde_mm_sll_pi32(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_slli_pi16 (simde__m64 a, int count) { - #if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI) - return _mm_slli_pi16(a, count); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - - #if defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i16 = psllh_s(a_.mmi_i16, count); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_BUG_CLANG_POWER9_16x4_BAD_SHIFT) - if (HEDLEY_UNLIKELY(count > 15)) - return simde_mm_setzero_si64(); - - r_.i16 = a_.i16 << HEDLEY_STATIC_CAST(int16_t, count); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i16 = a_.i16 << count; - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16((int16_t) count)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, a_.u16[i] << count); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psllwi(a, count) simde_mm_slli_pi16(a, count) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_slli_pi16(a, count) simde_mm_slli_pi16(a, count) -# define _m_psllwi(a, count) simde_mm_slli_pi16(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_slli_pi32 (simde__m64 a, int count) { - #if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI) - return _mm_slli_pi32(a, count); - #else - simde__m64_private r_; - 
simde__m64_private a_ = simde__m64_to_private(a); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i32 = a_.i32 << count; - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vshl_s32(a_.neon_i32, vmov_n_s32((int32_t) count)); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i32 = psllw_s(a_.mmi_i32, b_.mmi_i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] << count; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pslldi(a, b) simde_mm_slli_pi32(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_slli_pi32(a, count) simde_mm_slli_pi32(a, count) -# define _m_pslldi(a, count) simde_mm_slli_pi32(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_slli_si64 (simde__m64 a, int count) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_slli_si64(a, count); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i64 = a_.i64 << count; - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vshl_s64(a_.neon_i64, vmov_n_s64((int64_t) count)); - #else - r_.u64[0] = a_.u64[0] << count; - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psllqi(a, count) simde_mm_slli_si64(a, count) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_slli_si64(a, count) simde_mm_slli_si64(a, count) -# define _m_psllqi(a, count) simde_mm_slli_si64(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_sll_si64 (simde__m64 a, simde__m64 count) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_sll_si64(a, count); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private count_ = simde__m64_to_private(count); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vshl_s64(a_.neon_i64, count_.neon_i64); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = a_.i64 << count_.i64; - #else - if (HEDLEY_UNLIKELY(count_.u64[0] > 63)) { - simde_memset(&r_, 0, sizeof(r_)); - return simde__m64_from_private(r_); - } - - r_.u64[0] = a_.u64[0] << count_.u64[0]; - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psllq(a, count) simde_mm_sll_si64(a, count) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_sll_si64(a, count) simde_mm_sll_si64(a, count) -# define _m_psllq(a, count) simde_mm_sll_si64(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_srl_pi16 (simde__m64 a, simde__m64 count) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_srl_pi16(a, count); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private count_ = simde__m64_to_private(count); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_BUG_CLANG_POWER9_16x4_BAD_SHIFT) - if (HEDLEY_UNLIKELY(count_.u64[0] > 15)) - return simde_mm_setzero_si64(); - - r_.u16 = a_.u16 >> HEDLEY_STATIC_CAST(uint16_t, count_.u64[0]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u16 = a_.u16 >> count_.u64[0]; - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vshl_u16(a_.neon_u16, vmov_n_s16(-((int16_t) vget_lane_u64(count_.neon_u64, 0)))); - #else - if (HEDLEY_UNLIKELY(count_.u64[0] > 15)) { - simde_memset(&r_, 0, sizeof(r_)); - return simde__m64_from_private(r_); - } - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < sizeof(r_.u16) / sizeof(r_.u16[0]) ; i++) { - r_.u16[i] = a_.u16[i] >> count_.u64[0]; - } - 
#endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psrlw(a, count) simde_mm_srl_pi16(a, count) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_srl_pi16(a, count) simde_mm_srl_pi16(a, count) -# define _m_psrlw(a, count) simde_mm_srl_pi16(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_srl_pi32 (simde__m64 a, simde__m64 count) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_srl_pi32(a, count); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private count_ = simde__m64_to_private(count); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u32 = a_.u32 >> count_.u64[0]; - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vshl_u32(a_.neon_u32, vmov_n_s32(-((int32_t) vget_lane_u64(count_.neon_u64, 0)))); - #else - if (HEDLEY_UNLIKELY(count_.u64[0] > 31)) { - simde_memset(&r_, 0, sizeof(r_)); - return simde__m64_from_private(r_); - } - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < sizeof(r_.u32) / sizeof(r_.u32[0]) ; i++) { - r_.u32[i] = a_.u32[i] >> count_.u64[0]; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psrld(a, count) simde_mm_srl_pi32(a, count) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_srl_pi32(a, count) simde_mm_srl_pi32(a, count) -# define _m_psrld(a, count) simde_mm_srl_pi32(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_srli_pi16 (simde__m64 a, int count) { - #if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI) - return _mm_srli_pi16(a, count); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u16 = a_.u16 >> count; - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vshl_u16(a_.neon_u16, vmov_n_s16(-((int16_t) count))); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i16 = psrlh_s(a_.mmi_i16, b_.mmi_i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = a_.u16[i] >> count; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psrlwi(a, count) simde_mm_srli_pi16(a, count) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_srli_pi16(a, count) simde_mm_srli_pi16(a, count) -# define _m_psrlwi(a, count) simde_mm_srli_pi16(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_srli_pi32 (simde__m64 a, int count) { - #if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI) - return _mm_srli_pi32(a, count); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u32 = a_.u32 >> count; - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vshl_u32(a_.neon_u32, vmov_n_s32(-((int32_t) count))); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i32 = psrlw_s(a_.mmi_i32, b_.mmi_i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] >> count; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psrldi(a, count) simde_mm_srli_pi32(a, count) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_srli_pi32(a, count) simde_mm_srli_pi32(a, count) -# define _m_psrldi(a, count) simde_mm_srli_pi32(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_srli_si64 (simde__m64 a, int count) { - #if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI) - return _mm_srli_si64(a, count); 
- #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u64 = vshl_u64(a_.neon_u64, vmov_n_s64(-count)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u64 = a_.u64 >> count; - #else - r_.u64[0] = a_.u64[0] >> count; - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psrlqi(a, count) simde_mm_srli_si64(a, count) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_srli_si64(a, count) simde_mm_srli_si64(a, count) -# define _m_psrlqi(a, count) simde_mm_srli_si64(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_srl_si64 (simde__m64 a, simde__m64 count) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_srl_si64(a, count); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private count_ = simde__m64_to_private(count); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u64 = vshl_u64(a_.neon_u64, vneg_s64(count_.neon_i64)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u64 = a_.u64 >> count_.u64; - #else - if (HEDLEY_UNLIKELY(count_.u64[0] > 63)) { - simde_memset(&r_, 0, sizeof(r_)); - return simde__m64_from_private(r_); - } - - r_.u64[0] = a_.u64[0] >> count_.u64[0]; - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psrlq(a, count) simde_mm_srl_si64(a, count) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_srl_si64(a, count) simde_mm_srl_si64(a, count) -# define _m_psrlq(a, count) simde_mm_srl_si64(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_srai_pi16 (simde__m64 a, int count) { - #if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI) - return _mm_srai_pi16(a, count); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i16 = a_.i16 >> (count & 0xff); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16(-HEDLEY_STATIC_CAST(int16_t, count))); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i16 = psrah_s(a_.mmi_i16, count); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[i] >> (count & 0xff); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psrawi(a, count) simde_mm_srai_pi16(a, count) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_srai_pi16(a, count) simde_mm_srai_pi16(a, count) -# define _m_psrawi(a, count) simde_mm_srai_pi16(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_srai_pi32 (simde__m64 a, int count) { - #if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI) - return _mm_srai_pi32(a, count); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i32 = a_.i32 >> (count & 0xff); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vshl_s32(a_.neon_i32, vmov_n_s32(-HEDLEY_STATIC_CAST(int32_t, count))); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i32 = psraw_s(a_.mmi_i32, count); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] >> (count & 0xff); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psradi(a, count) simde_mm_srai_pi32(a, count) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_srai_pi32(a, count) simde_mm_srai_pi32(a, count) 
-# define _m_psradi(a, count) simde_mm_srai_pi32(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_sra_pi16 (simde__m64 a, simde__m64 count) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_sra_pi16(a, count); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private count_ = simde__m64_to_private(count); - const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 15 ? 15 : count_.i64[0])); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i16 = a_.i16 >> cnt; - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16(-HEDLEY_STATIC_CAST(int16_t, vget_lane_u64(count_.neon_u64, 0)))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[i] >> cnt; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psraw(a, count) simde_mm_sra_pi16(a, count) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_sra_pi16(a, count) simde_mm_sra_pi16(a, count) -# define _m_psraw(a, count) simde_mm_sra_pi16(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_sra_pi32 (simde__m64 a, simde__m64 count) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_sra_pi32(a, count); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private count_ = simde__m64_to_private(count); - const int32_t cnt = (count_.u64[0] > 31) ? 31 : HEDLEY_STATIC_CAST(int32_t, count_.u64[0]); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i32 = a_.i32 >> cnt; - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vshl_s32(a_.neon_i32, vmov_n_s32(-HEDLEY_STATIC_CAST(int32_t, vget_lane_u64(count_.neon_u64, 0)))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] >> cnt; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psrad(a, b) simde_mm_sra_pi32(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_sra_pi32(a, count) simde_mm_sra_pi32(a, count) -# define _m_psrad(a, count) simde_mm_sra_pi32(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_sub_pi8 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_sub_pi8(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vsub_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i8 = psubb_s(a_.mmi_i8, b_.mmi_i8); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = a_.i8 - b_.i8; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[i] - b_.i8[i]; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psubb(a, b) simde_mm_sub_pi8(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_sub_pi8(a, b) simde_mm_sub_pi8(a, b) -# define _m_psubb(a, b) simde_mm_sub_pi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_sub_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_sub_pi16(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vsub_s16(a_.neon_i16, b_.neon_i16); - #elif 
defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i16 = psubh_s(a_.mmi_i16, b_.mmi_i16); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = a_.i16 - b_.i16; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[i] - b_.i16[i]; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psubw(a, b) simde_mm_sub_pi16(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_sub_pi16(a, b) simde_mm_sub_pi16(a, b) -# define _m_psubw(a, b) simde_mm_sub_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_sub_pi32 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_sub_pi32(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vsub_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i32 = psubw_s(a_.mmi_i32, b_.mmi_i32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = a_.i32 - b_.i32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] - b_.i32[i]; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psubd(a, b) simde_mm_sub_pi32(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_sub_pi32(a, b) simde_mm_sub_pi32(a, b) -# define _m_psubd(a, b) simde_mm_sub_pi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_subs_pi8 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_subs_pi8(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vqsub_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i8 = psubsb(a_.mmi_i8, b_.mmi_i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - if (((b_.i8[i]) > 0 && (a_.i8[i]) < INT8_MIN + (b_.i8[i]))) { - r_.i8[i] = INT8_MIN; - } else if ((b_.i8[i]) < 0 && (a_.i8[i]) > INT8_MAX + (b_.i8[i])) { - r_.i8[i] = INT8_MAX; - } else { - r_.i8[i] = (a_.i8[i]) - (b_.i8[i]); - } - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psubsb(a, b) simde_mm_subs_pi8(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_subs_pi8(a, b) simde_mm_subs_pi8(a, b) -# define _m_psubsb(a, b) simde_mm_subs_pi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_subs_pu8 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_subs_pu8(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vqsub_u8(a_.neon_u8, b_.neon_u8); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_u8 = psubusb(a_.mmi_u8, b_.mmi_u8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - const int32_t x = a_.u8[i] - b_.u8[i]; - if (x < 0) { - r_.u8[i] = 0; - } else if (x > UINT8_MAX) { - r_.u8[i] = UINT8_MAX; - } else { - r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, x); - } - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psubusb(a, b) simde_mm_subs_pu8(a, b) -#if 
defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_subs_pu8(a, b) simde_mm_subs_pu8(a, b) -# define _m_psubusb(a, b) simde_mm_subs_pu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_subs_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_subs_pi16(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vqsub_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i16 = psubsh(a_.mmi_i16, b_.mmi_i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - if (((b_.i16[i]) > 0 && (a_.i16[i]) < SHRT_MIN + (b_.i16[i]))) { - r_.i16[i] = SHRT_MIN; - } else if ((b_.i16[i]) < 0 && (a_.i16[i]) > INT16_MAX + (b_.i16[i])) { - r_.i16[i] = INT16_MAX; - } else { - r_.i16[i] = (a_.i16[i]) - (b_.i16[i]); - } - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psubsw(a, b) simde_mm_subs_pi16(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_subs_pi16(a, b) simde_mm_subs_pi16(a, b) -# define _m_psubsw(a, b) simde_mm_subs_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_subs_pu16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_subs_pu16(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vqsub_u16(a_.neon_u16, b_.neon_u16); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_u16 = psubush(a_.mmi_u16, b_.mmi_u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - const int x = a_.u16[i] - b_.u16[i]; - if (x < 0) { - r_.u16[i] = 0; - } else if (x > UINT16_MAX) { - r_.u16[i] = UINT16_MAX; - } else { - r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, x); - } - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psubusw(a, b) simde_mm_subs_pu16(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_subs_pu16(a, b) simde_mm_subs_pu16(a, b) -# define _m_psubusw(a, b) simde_mm_subs_pu16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_unpackhi_pi8 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_unpackhi_pi8(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i8 = vzip2_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i8 = punpckhbh_s(a_.mmi_i8, b_.mmi_i8); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 8, a_.i8, b_.i8, 4, 12, 5, 13, 6, 14, 7, 15); - #else - r_.i8[0] = a_.i8[4]; - r_.i8[1] = b_.i8[4]; - r_.i8[2] = a_.i8[5]; - r_.i8[3] = b_.i8[5]; - r_.i8[4] = a_.i8[6]; - r_.i8[5] = b_.i8[6]; - r_.i8[6] = a_.i8[7]; - r_.i8[7] = b_.i8[7]; - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_punpckhbw(a, b) simde_mm_unpackhi_pi8(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_unpackhi_pi8(a, b) simde_mm_unpackhi_pi8(a, b) -# define _m_punpckhbw(a, b) simde_mm_unpackhi_pi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_unpackhi_pi16 (simde__m64 a, simde__m64 b) { - #if 
defined(SIMDE_X86_MMX_NATIVE) - return _mm_unpackhi_pi16(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i16 = vzip2_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i16 = punpckhhw_s(a_.mmi_i16, b_.mmi_i16); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 8, a_.i16, b_.i16, 2, 6, 3, 7); - #else - r_.i16[0] = a_.i16[2]; - r_.i16[1] = b_.i16[2]; - r_.i16[2] = a_.i16[3]; - r_.i16[3] = b_.i16[3]; - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_punpckhwd(a, b) simde_mm_unpackhi_pi16(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_unpackhi_pi16(a, b) simde_mm_unpackhi_pi16(a, b) -# define _m_punpckhwd(a, b) simde_mm_unpackhi_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_unpackhi_pi32 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_unpackhi_pi32(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i32 = vzip2_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i32 = punpckhwd_s(a_.mmi_i32, b_.mmi_i32); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.i32, b_.i32, 1, 3); - #else - r_.i32[0] = a_.i32[1]; - r_.i32[1] = b_.i32[1]; - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_punpckhdq(a, b) simde_mm_unpackhi_pi32(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_unpackhi_pi32(a, b) simde_mm_unpackhi_pi32(a, b) -# define _m_punpckhdq(a, b) simde_mm_unpackhi_pi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_unpacklo_pi8 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_unpacklo_pi8(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i8 = vzip1_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i8 = punpcklbh_s(a_.mmi_i8, b_.mmi_i8); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 8, a_.i8, b_.i8, 0, 8, 1, 9, 2, 10, 3, 11); - #else - r_.i8[0] = a_.i8[0]; - r_.i8[1] = b_.i8[0]; - r_.i8[2] = a_.i8[1]; - r_.i8[3] = b_.i8[1]; - r_.i8[4] = a_.i8[2]; - r_.i8[5] = b_.i8[2]; - r_.i8[6] = a_.i8[3]; - r_.i8[7] = b_.i8[3]; - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_punpcklbw(a, b) simde_mm_unpacklo_pi8(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_unpacklo_pi8(a, b) simde_mm_unpacklo_pi8(a, b) -# define _m_punpcklbw(a, b) simde_mm_unpacklo_pi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_unpacklo_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_unpacklo_pi16(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i16 = vzip1_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i16 = punpcklhw_s(a_.mmi_i16, b_.mmi_i16); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i16 = 
SIMDE_SHUFFLE_VECTOR_(16, 8, a_.i16, b_.i16, 0, 4, 1, 5); - #else - r_.i16[0] = a_.i16[0]; - r_.i16[1] = b_.i16[0]; - r_.i16[2] = a_.i16[1]; - r_.i16[3] = b_.i16[1]; - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_punpcklwd(a, b) simde_mm_unpacklo_pi16(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_unpacklo_pi16(a, b) simde_mm_unpacklo_pi16(a, b) -# define _m_punpcklwd(a, b) simde_mm_unpacklo_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_unpacklo_pi32 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_unpacklo_pi32(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i32 = vzip1_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i32 = punpcklwd_s(a_.mmi_i32, b_.mmi_i32); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.i32, b_.i32, 0, 2); - #else - r_.i32[0] = a_.i32[0]; - r_.i32[1] = b_.i32[0]; - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_punpckldq(a, b) simde_mm_unpacklo_pi32(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_unpacklo_pi32(a, b) simde_mm_unpacklo_pi32(a, b) -# define _m_punpckldq(a, b) simde_mm_unpacklo_pi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_xor_si64 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_xor_si64(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = veor_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f ^ b_.i32f; - #else - r_.u64[0] = a_.u64[0] ^ b_.u64[0]; - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pxor(a, b) simde_mm_xor_si64(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_xor_si64(a, b) simde_mm_xor_si64(a, b) -# define _m_pxor(a, b) simde_mm_xor_si64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int32_t -simde_m_to_int (simde__m64 a) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _m_to_int(a); - #else - simde__m64_private a_ = simde__m64_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - HEDLEY_DIAGNOSTIC_PUSH - #if HEDLEY_HAS_WARNING("-Wvector-conversion") && SIMDE_DETECT_CLANG_VERSION_NOT(10,0,0) - #pragma clang diagnostic ignored "-Wvector-conversion" - #endif - return vget_lane_s32(a_.neon_i32, 0); - HEDLEY_DIAGNOSTIC_POP - #else - return a_.i32[0]; - #endif - #endif -} -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _m_to_int(a) simde_m_to_int(a) -#endif - -SIMDE_END_DECLS_ - -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_MMX_H) */ diff --git a/extern/simde/x86/sse.h b/extern/simde/x86/sse.h deleted file mode 100644 index a3e060a26..000000000 --- a/extern/simde/x86/sse.h +++ /dev/null @@ -1,4830 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished 
to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2017-2020 Evan Nemerson - * 2015-2017 John W. Ratcliff - * 2015 Brandon Rowlett - * 2015 Ken Fast - */ - -#if !defined(SIMDE_X86_SSE_H) -#define SIMDE_X86_SSE_H - -#include "mmx.h" -#include "../simde-f16.h" - -#if defined(_WIN32) && !defined(SIMDE_X86_SSE_NATIVE) && defined(_MSC_VER) - #define NOMINMAX - #include -#endif - -#if defined(__ARM_ACLE) - #include -#endif - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -typedef union { - #if defined(SIMDE_VECTOR_SUBSCRIPT) - SIMDE_ALIGN_TO_16 int8_t i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 int16_t i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 int32_t i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 int64_t i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 uint8_t u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 uint16_t u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 uint32_t u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 uint64_t u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - #if defined(SIMDE_HAVE_INT128_) - SIMDE_ALIGN_TO_16 simde_int128 i128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 simde_uint128 u128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - #endif - #if defined(SIMDE_FLOAT16_VECTOR) - SIMDE_ALIGN_TO_16 simde_float16 f16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - #else - SIMDE_ALIGN_TO_16 simde_float16 f16[8]; - #endif - SIMDE_ALIGN_TO_16 simde_float32 f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 int_fast32_t i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - #else - SIMDE_ALIGN_TO_16 int8_t i8[16]; - SIMDE_ALIGN_TO_16 int16_t i16[8]; - SIMDE_ALIGN_TO_16 int32_t i32[4]; - SIMDE_ALIGN_TO_16 int64_t i64[2]; - SIMDE_ALIGN_TO_16 uint8_t u8[16]; - SIMDE_ALIGN_TO_16 uint16_t u16[8]; - SIMDE_ALIGN_TO_16 uint32_t u32[4]; - SIMDE_ALIGN_TO_16 uint64_t u64[2]; - #if defined(SIMDE_HAVE_INT128_) - SIMDE_ALIGN_TO_16 simde_int128 i128[1]; - SIMDE_ALIGN_TO_16 simde_uint128 u128[1]; - #endif - SIMDE_ALIGN_TO_16 simde_float16 f16[8]; - SIMDE_ALIGN_TO_16 simde_float32 f32[4]; - SIMDE_ALIGN_TO_16 int_fast32_t i32f[16 / sizeof(int_fast32_t)]; - SIMDE_ALIGN_TO_16 uint_fast32_t u32f[16 / sizeof(uint_fast32_t)]; - #endif - - SIMDE_ALIGN_TO_16 simde__m64_private m64_private[2]; - SIMDE_ALIGN_TO_16 simde__m64 m64[2]; - - #if defined(SIMDE_X86_SSE_NATIVE) - SIMDE_ALIGN_TO_16 __m128 n; - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - SIMDE_ALIGN_TO_16 int8x16_t neon_i8; - SIMDE_ALIGN_TO_16 int16x8_t neon_i16; - SIMDE_ALIGN_TO_16 int32x4_t neon_i32; - SIMDE_ALIGN_TO_16 int64x2_t neon_i64; - SIMDE_ALIGN_TO_16 uint8x16_t neon_u8; - SIMDE_ALIGN_TO_16 uint16x8_t neon_u16; - SIMDE_ALIGN_TO_16 uint32x4_t neon_u32; - SIMDE_ALIGN_TO_16 uint64x2_t neon_u64; - SIMDE_ALIGN_TO_16 float32x4_t neon_f32; - #if 
defined(SIMDE_ARM_NEON_A64V8_NATIVE) - SIMDE_ALIGN_TO_16 float64x2_t neon_f64; - #endif - #elif defined(SIMDE_MIPS_MSA_NATIVE) - v16i8 msa_i8; - v8i16 msa_i16; - v4i32 msa_i32; - v2i64 msa_i64; - v16u8 msa_u8; - v8u16 msa_u16; - v4u32 msa_u32; - v2u64 msa_u64; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - SIMDE_ALIGN_TO_16 v128_t wasm_v128; - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32; - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64; - #endif - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - v16i8 lsx_i8; - v8i16 lsx_i16; - v4i32 lsx_i32; - v2i64 lsx_i64; - v16u8 lsx_u8; - v8u16 lsx_u16; - v4u32 lsx_u32; - v2u64 lsx_u64; - v4f32 lsx_f32; - v2f64 lsx_f64; - #endif -} simde__m128_private; - -#if defined(SIMDE_X86_SSE_NATIVE) - typedef __m128 simde__m128; -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - typedef float32x4_t simde__m128; -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - typedef v128_t simde__m128; -#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - typedef SIMDE_POWER_ALTIVEC_VECTOR(float) simde__m128; -#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - typedef v4f32 simde__m128; -#elif defined(SIMDE_VECTOR_SUBSCRIPT) - typedef simde_float32 simde__m128 SIMDE_ALIGN_TO_16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; -#else - typedef simde__m128_private simde__m128; -#endif - -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) - typedef simde__m128 __m128; -#endif - -HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128), "simde__m128 size incorrect"); -HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128_private), "simde__m128_private size incorrect"); -#if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF) -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128) == 16, "simde__m128 is not 16-byte aligned"); -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128_private) == 16, "simde__m128_private is not 16-byte aligned"); -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde__m128_from_private(simde__m128_private v) { - simde__m128 r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128_private -simde__m128_to_private(simde__m128 v) { - simde__m128_private r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, int8x16_t, neon, i8) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, int16x8_t, neon, i16) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, int32x4_t, neon, i32) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, int64x2_t, neon, i64) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, uint8x16_t, neon, u8) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, uint16x8_t, neon, u16) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, uint32x4_t, neon, u32) - 
SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, uint64x2_t, neon, u64) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, float32x4_t, neon, f32) - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, float64x2_t, neon, f64) - #endif -#endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */ - -#if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(signed char), altivec, i8) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(signed short), altivec, i16) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(signed int), altivec, i32) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), altivec, u8) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), altivec, u16) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), altivec, u32) - - #if defined(SIMDE_BUG_GCC_95782) - SIMDE_FUNCTION_ATTRIBUTES - SIMDE_POWER_ALTIVEC_VECTOR(float) - simde__m128_to_altivec_f32(simde__m128 value) { - simde__m128_private r_ = simde__m128_to_private(value); - return r_.altivec_f32; - } - - SIMDE_FUNCTION_ATTRIBUTES - simde__m128 - simde__m128_from_altivec_f32(SIMDE_POWER_ALTIVEC_VECTOR(float) value) { - simde__m128_private r_; - r_.altivec_f32 = value; - return simde__m128_from_private(r_); - } - #else - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(float), altivec, f32) - #endif - - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(signed long long), altivec, i64) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), altivec, u64) - #endif -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v128_t, wasm, v128); -#endif /* defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) */ - -#if defined(SIMDE_LOONGARCH_LSX_NATIVE) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v16i8, lsx, i8) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v8i16, lsx, i16) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v4i32, lsx, i32) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v2i64, lsx, i64) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v16u8, lsx, u8) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v8u16, lsx, u16) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v4u32, lsx, u32) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v2u64, lsx, u64) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v4f32, lsx, f32) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v2f64, lsx, f64) -#endif /* defined(SIMDE_LOONGARCH_LSX_NATIVE) */ - -enum { - #if defined(SIMDE_X86_SSE_NATIVE) - SIMDE_MM_ROUND_NEAREST = _MM_ROUND_NEAREST, - SIMDE_MM_ROUND_DOWN = _MM_ROUND_DOWN, - SIMDE_MM_ROUND_UP = _MM_ROUND_UP, - SIMDE_MM_ROUND_TOWARD_ZERO = _MM_ROUND_TOWARD_ZERO - #else - SIMDE_MM_ROUND_NEAREST = 0x0000, - SIMDE_MM_ROUND_DOWN = 0x2000, - SIMDE_MM_ROUND_UP = 0x4000, - SIMDE_MM_ROUND_TOWARD_ZERO = 0x6000 - #endif -}; -#if defined(_MM_ROUND_MASK) -# define SIMDE_MM_ROUND_MASK _MM_ROUND_MASK -#else -# define SIMDE_MM_ROUND_MASK (0x6000) -#endif -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) - #define _MM_ROUND_MASK SIMDE_MM_ROUND_MASK -#endif - -#if defined(_MM_FROUND_TO_NEAREST_INT) -# define SIMDE_MM_FROUND_TO_NEAREST_INT _MM_FROUND_TO_NEAREST_INT -# define SIMDE_MM_FROUND_TO_NEG_INF 
_MM_FROUND_TO_NEG_INF -# define SIMDE_MM_FROUND_TO_POS_INF _MM_FROUND_TO_POS_INF -# define SIMDE_MM_FROUND_TO_ZERO _MM_FROUND_TO_ZERO -# define SIMDE_MM_FROUND_CUR_DIRECTION _MM_FROUND_CUR_DIRECTION - -# define SIMDE_MM_FROUND_RAISE_EXC _MM_FROUND_RAISE_EXC -# define SIMDE_MM_FROUND_NO_EXC _MM_FROUND_NO_EXC -#else -# define SIMDE_MM_FROUND_TO_NEAREST_INT 0x00 -# define SIMDE_MM_FROUND_TO_NEG_INF 0x01 -# define SIMDE_MM_FROUND_TO_POS_INF 0x02 -# define SIMDE_MM_FROUND_TO_ZERO 0x03 -# define SIMDE_MM_FROUND_CUR_DIRECTION 0x04 - -# define SIMDE_MM_FROUND_RAISE_EXC 0x00 -# define SIMDE_MM_FROUND_NO_EXC 0x08 -#endif - -#define SIMDE_MM_FROUND_NINT \ - (SIMDE_MM_FROUND_TO_NEAREST_INT | SIMDE_MM_FROUND_RAISE_EXC) -#define SIMDE_MM_FROUND_FLOOR \ - (SIMDE_MM_FROUND_TO_NEG_INF | SIMDE_MM_FROUND_RAISE_EXC) -#define SIMDE_MM_FROUND_CEIL \ - (SIMDE_MM_FROUND_TO_POS_INF | SIMDE_MM_FROUND_RAISE_EXC) -#define SIMDE_MM_FROUND_TRUNC \ - (SIMDE_MM_FROUND_TO_ZERO | SIMDE_MM_FROUND_RAISE_EXC) -#define SIMDE_MM_FROUND_RINT \ - (SIMDE_MM_FROUND_CUR_DIRECTION | SIMDE_MM_FROUND_RAISE_EXC) -#define SIMDE_MM_FROUND_NEARBYINT \ - (SIMDE_MM_FROUND_CUR_DIRECTION | SIMDE_MM_FROUND_NO_EXC) - -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) && !defined(_MM_FROUND_TO_NEAREST_INT) -# define _MM_FROUND_TO_NEAREST_INT SIMDE_MM_FROUND_TO_NEAREST_INT -# define _MM_FROUND_TO_NEG_INF SIMDE_MM_FROUND_TO_NEG_INF -# define _MM_FROUND_TO_POS_INF SIMDE_MM_FROUND_TO_POS_INF -# define _MM_FROUND_TO_ZERO SIMDE_MM_FROUND_TO_ZERO -# define _MM_FROUND_CUR_DIRECTION SIMDE_MM_FROUND_CUR_DIRECTION -# define _MM_FROUND_RAISE_EXC SIMDE_MM_FROUND_RAISE_EXC -# define _MM_FROUND_NINT SIMDE_MM_FROUND_NINT -# define _MM_FROUND_FLOOR SIMDE_MM_FROUND_FLOOR -# define _MM_FROUND_CEIL SIMDE_MM_FROUND_CEIL -# define _MM_FROUND_TRUNC SIMDE_MM_FROUND_TRUNC -# define _MM_FROUND_RINT SIMDE_MM_FROUND_RINT -# define _MM_FROUND_NEARBYINT SIMDE_MM_FROUND_NEARBYINT -#endif - -#if defined(_MM_EXCEPT_INVALID) -# define SIMDE_MM_EXCEPT_INVALID _MM_EXCEPT_INVALID -#else -# define SIMDE_MM_EXCEPT_INVALID (0x0001) -#endif -#if defined(_MM_EXCEPT_DENORM) -# define SIMDE_MM_EXCEPT_DENORM _MM_EXCEPT_DENORM -#else -# define SIMDE_MM_EXCEPT_DENORM (0x0002) -#endif -#if defined(_MM_EXCEPT_DIV_ZERO) -# define SIMDE_MM_EXCEPT_DIV_ZERO _MM_EXCEPT_DIV_ZERO -#else -# define SIMDE_MM_EXCEPT_DIV_ZERO (0x0004) -#endif -#if defined(_MM_EXCEPT_OVERFLOW) -# define SIMDE_MM_EXCEPT_OVERFLOW _MM_EXCEPT_OVERFLOW -#else -# define SIMDE_MM_EXCEPT_OVERFLOW (0x0008) -#endif -#if defined(_MM_EXCEPT_UNDERFLOW) -# define SIMDE_MM_EXCEPT_UNDERFLOW _MM_EXCEPT_UNDERFLOW -#else -# define SIMDE_MM_EXCEPT_UNDERFLOW (0x0010) -#endif -#if defined(_MM_EXCEPT_INEXACT) -# define SIMDE_MM_EXCEPT_INEXACT _MM_EXCEPT_INEXACT -#else -# define SIMDE_MM_EXCEPT_INEXACT (0x0020) -#endif -#if defined(_MM_EXCEPT_MASK) -# define SIMDE_MM_EXCEPT_MASK _MM_EXCEPT_MASK -#else -# define SIMDE_MM_EXCEPT_MASK \ - (SIMDE_MM_EXCEPT_INVALID | SIMDE_MM_EXCEPT_DENORM | \ - SIMDE_MM_EXCEPT_DIV_ZERO | SIMDE_MM_EXCEPT_OVERFLOW | \ - SIMDE_MM_EXCEPT_UNDERFLOW | SIMDE_MM_EXCEPT_INEXACT) -#endif -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) - #define _MM_EXCEPT_INVALID SIMDE_MM_EXCEPT_INVALID - #define _MM_EXCEPT_DENORM SIMDE_MM_EXCEPT_DENORM - #define _MM_EXCEPT_DIV_ZERO SIMDE_MM_EXCEPT_DIV_ZERO - #define _MM_EXCEPT_OVERFLOW SIMDE_MM_EXCEPT_OVERFLOW - #define _MM_EXCEPT_UNDERFLOW SIMDE_MM_EXCEPT_UNDERFLOW - #define _MM_EXCEPT_INEXACT SIMDE_MM_EXCEPT_INEXACT - #define _MM_EXCEPT_MASK SIMDE_MM_EXCEPT_MASK -#endif - -#if 
defined(_MM_MASK_INVALID) -# define SIMDE_MM_MASK_INVALID _MM_MASK_INVALID -#else -# define SIMDE_MM_MASK_INVALID (0x0080) -#endif -#if defined(_MM_MASK_DENORM) -# define SIMDE_MM_MASK_DENORM _MM_MASK_DENORM -#else -# define SIMDE_MM_MASK_DENORM (0x0100) -#endif -#if defined(_MM_MASK_DIV_ZERO) -# define SIMDE_MM_MASK_DIV_ZERO _MM_MASK_DIV_ZERO -#else -# define SIMDE_MM_MASK_DIV_ZERO (0x0200) -#endif -#if defined(_MM_MASK_OVERFLOW) -# define SIMDE_MM_MASK_OVERFLOW _MM_MASK_OVERFLOW -#else -# define SIMDE_MM_MASK_OVERFLOW (0x0400) -#endif -#if defined(_MM_MASK_UNDERFLOW) -# define SIMDE_MM_MASK_UNDERFLOW _MM_MASK_UNDERFLOW -#else -# define SIMDE_MM_MASK_UNDERFLOW (0x0800) -#endif -#if defined(_MM_MASK_INEXACT) -# define SIMDE_MM_MASK_INEXACT _MM_MASK_INEXACT -#else -# define SIMDE_MM_MASK_INEXACT (0x1000) -#endif -#if defined(_MM_MASK_MASK) -# define SIMDE_MM_MASK_MASK _MM_MASK_MASK -#else -# define SIMDE_MM_MASK_MASK \ - (SIMDE_MM_MASK_INVALID | SIMDE_MM_MASK_DENORM | \ - SIMDE_MM_MASK_DIV_ZERO | SIMDE_MM_MASK_OVERFLOW | \ - SIMDE_MM_MASK_UNDERFLOW | SIMDE_MM_MASK_INEXACT) -#endif -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) - #define _MM_MASK_INVALID SIMDE_MM_MASK_INVALID - #define _MM_MASK_DENORM SIMDE_MM_MASK_DENORM - #define _MM_MASK_DIV_ZERO SIMDE_MM_MASK_DIV_ZERO - #define _MM_MASK_OVERFLOW SIMDE_MM_MASK_OVERFLOW - #define _MM_MASK_UNDERFLOW SIMDE_MM_MASK_UNDERFLOW - #define _MM_MASK_INEXACT SIMDE_MM_MASK_INEXACT - #define _MM_MASK_MASK SIMDE_MM_MASK_MASK -#endif - -#if defined(_MM_FLUSH_ZERO_MASK) -# define SIMDE_MM_FLUSH_ZERO_MASK _MM_FLUSH_ZERO_MASK -#else -# define SIMDE_MM_FLUSH_ZERO_MASK (0x8000) -#endif -#if defined(_MM_FLUSH_ZERO_ON) -# define SIMDE_MM_FLUSH_ZERO_ON _MM_FLUSH_ZERO_ON -#else -# define SIMDE_MM_FLUSH_ZERO_ON (0x8000) -#endif -#if defined(_MM_FLUSH_ZERO_OFF) -# define SIMDE_MM_FLUSH_ZERO_OFF _MM_FLUSH_ZERO_OFF -#else -# define SIMDE_MM_FLUSH_ZERO_OFF (0x0000) -#endif -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) - #define _MM_FLUSH_ZERO_MASK SIMDE_MM_FLUSH_ZERO_MASK - #define _MM_FLUSH_ZERO_ON SIMDE_MM_FLUSH_ZERO_ON - #define _MM_FLUSH_ZERO_OFF SIMDE_MM_FLUSH_ZERO_OFF -#endif - -SIMDE_FUNCTION_ATTRIBUTES -uint32_t -SIMDE_MM_GET_ROUNDING_MODE(void) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _MM_GET_ROUNDING_MODE(); - #elif defined(SIMDE_HAVE_FENV_H) - unsigned int vfe_mode; - - switch (fegetround()) { - #if defined(FE_TONEAREST) - case FE_TONEAREST: - vfe_mode = SIMDE_MM_ROUND_NEAREST; - break; - #endif - - #if defined(FE_TOWARDZERO) - case FE_TOWARDZERO: - vfe_mode = SIMDE_MM_ROUND_DOWN; - break; - #endif - - #if defined(FE_UPWARD) - case FE_UPWARD: - vfe_mode = SIMDE_MM_ROUND_UP; - break; - #endif - - #if defined(FE_DOWNWARD) - case FE_DOWNWARD: - vfe_mode = SIMDE_MM_ROUND_TOWARD_ZERO; - break; - #endif - - default: - vfe_mode = SIMDE_MM_ROUND_NEAREST; - break; - } - - return vfe_mode; - #else - return SIMDE_MM_ROUND_NEAREST; - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) - #define _MM_GET_ROUNDING_MODE() SIMDE_MM_GET_ROUNDING_MODE() -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -SIMDE_MM_SET_ROUNDING_MODE(uint32_t a) { - #if defined(SIMDE_X86_SSE_NATIVE) - _MM_SET_ROUNDING_MODE(a); - #elif defined(SIMDE_HAVE_FENV_H) - int fe_mode = FE_TONEAREST; - - switch (a) { - #if defined(FE_TONEAREST) - case SIMDE_MM_ROUND_NEAREST: - fe_mode = FE_TONEAREST; - break; - #endif - - #if defined(FE_TOWARDZERO) - case SIMDE_MM_ROUND_TOWARD_ZERO: - fe_mode = FE_TOWARDZERO; - break; - #endif - - #if defined(FE_DOWNWARD) - case SIMDE_MM_ROUND_DOWN: - 
fe_mode = FE_DOWNWARD; - break; - #endif - - #if defined(FE_UPWARD) - case SIMDE_MM_ROUND_UP: - fe_mode = FE_UPWARD; - break; - #endif - - default: - return; - } - - fesetround(fe_mode); - #else - (void) a; - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) - #define _MM_SET_ROUNDING_MODE(a) SIMDE_MM_SET_ROUNDING_MODE(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -uint32_t -SIMDE_MM_GET_FLUSH_ZERO_MODE (void) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_getcsr() & _MM_FLUSH_ZERO_MASK; - #else - return SIMDE_MM_FLUSH_ZERO_OFF; - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) - #define _MM_GET_FLUSH_ZERO_MODE(a) SIMDE_MM_GET_FLUSH_ZERO_MODE(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -SIMDE_MM_SET_FLUSH_ZERO_MODE (uint32_t a) { - #if defined(SIMDE_X86_SSE_NATIVE) - _MM_SET_FLUSH_ZERO_MODE(a); - #else - (void) a; - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) - #define _MM_SET_FLUSH_ZERO_MODE(a) SIMDE_MM_SET_FLUSH_ZERO_MODE(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -uint32_t -simde_mm_getcsr (void) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_getcsr(); - #else - return SIMDE_MM_GET_ROUNDING_MODE(); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) - #define _mm_getcsr() simde_mm_getcsr() -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_setcsr (uint32_t a) { - #if defined(SIMDE_X86_SSE_NATIVE) - _mm_setcsr(a); - #else - SIMDE_MM_SET_ROUNDING_MODE(HEDLEY_STATIC_CAST(uint32_t, a & SIMDE_MM_ROUND_MASK)); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) - #define _mm_setcsr(a) simde_mm_setcsr(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_x_mm_round_ps (simde__m128 a, int rounding, int lax_rounding) - SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) - SIMDE_REQUIRE_CONSTANT_RANGE(lax_rounding, 0, 1) { - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - (void) lax_rounding; - - /* For architectures which lack a current direction SIMD instruction. - * - * Note that NEON actually has a current rounding mode instruction, - * but in ARMv8+ the rounding mode is ignored and nearest is always - * used, so we treat ARMv7 as having a rounding mode but ARMv8 as - * not. 
*/ - #if \ - defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || \ - defined(SIMDE_ARM_NEON_A32V8) - if ((rounding & 7) == SIMDE_MM_FROUND_CUR_DIRECTION) - rounding = HEDLEY_STATIC_CAST(int, SIMDE_MM_GET_ROUNDING_MODE()) << 13; - #endif - - switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) { - case SIMDE_MM_FROUND_CUR_DIRECTION: - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) - r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_round(a_.altivec_f32)); - #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_GCC_95399) - r_.neon_f32 = vrndiq_f32(a_.neon_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_nearest(a_.wasm_v128); - #elif defined(simde_math_nearbyintf) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_nearbyintf(a_.f32[i]); - } - #else - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); - #endif - break; - - case SIMDE_MM_FROUND_TO_NEAREST_INT: - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) - r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_rint(a_.altivec_f32)); - #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) - r_.neon_f32 = vrndnq_f32(a_.neon_f32); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vfrintrne_s(a_.lsx_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_nearest(a_.wasm_v128); - #elif defined(simde_math_roundevenf) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_roundevenf(a_.f32[i]); - } - #else - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); - #endif - break; - - case SIMDE_MM_FROUND_TO_NEG_INF: - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) - r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_floor(a_.altivec_f32)); - #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) - r_.neon_f32 = vrndmq_f32(a_.neon_f32); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vfrintrm_s(a_.lsx_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_floor(a_.wasm_v128); - #elif defined(simde_math_floorf) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_floorf(a_.f32[i]); - } - #else - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); - #endif - break; - - case SIMDE_MM_FROUND_TO_POS_INF: - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) - r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_ceil(a_.altivec_f32)); - #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) - r_.neon_f32 = vrndpq_f32(a_.neon_f32); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vfrintrp_s(a_.lsx_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_ceil(a_.wasm_v128); - #elif defined(simde_math_ceilf) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_ceilf(a_.f32[i]); - } - #else - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); - #endif - break; - - case SIMDE_MM_FROUND_TO_ZERO: - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) - r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_trunc(a_.altivec_f32)); - #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) - r_.neon_f32 = vrndq_f32(a_.neon_f32); 
- #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vfrintrz_s(a_.lsx_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_trunc(a_.wasm_v128); - #elif defined(simde_math_truncf) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_truncf(a_.f32[i]); - } - #else - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); - #endif - break; - - default: - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); - } - - return simde__m128_from_private(r_); -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) - #define simde_mm_round_ps(a, rounding) _mm_round_ps((a), (rounding)) -#else - #define simde_mm_round_ps(a, rounding) simde_x_mm_round_ps((a), (rounding), 0) -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #define _mm_round_ps(a, rounding) simde_mm_round_ps((a), (rounding)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_set_ps (simde_float32 e3, simde_float32 e2, simde_float32 e1, simde_float32 e0) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_set_ps(e3, e2, e1, e0); - #else - simde__m128_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - SIMDE_ALIGN_TO_16 simde_float32 data[4] = { e0, e1, e2, e3 }; - r_.neon_f32 = vld1q_f32(data); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_make(e0, e1, e2, e3); - #else - r_.f32[0] = e0; - r_.f32[1] = e1; - r_.f32[2] = e2; - r_.f32[3] = e3; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_set_ps(e3, e2, e1, e0) simde_mm_set_ps(e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_set_ps1 (simde_float32 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_set_ps1(a); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - return vdupq_n_f32(a); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) - (void) a; - return vec_splats(a); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - return (simde__m128)__lsx_vldrepl_w(&a, 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f32x4_splat(a); - #else - return simde_mm_set_ps(a, a, a, a); - #endif -} -#define simde_mm_set1_ps(a) simde_mm_set_ps1(a) -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_set_ps1(a) simde_mm_set_ps1(a) -# define _mm_set1_ps(a) simde_mm_set1_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_move_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_move_ss(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vsetq_lane_f32(vgetq_lane_f32(b_.neon_f32, 0), a_.neon_f32, 0); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) m = { ~0U, 0U, 0U, 0U }; - r_.altivec_f32 = vec_sel(a_.altivec_f32, b_.altivec_f32, m); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_shuffle(b_.wasm_v128, a_.wasm_v128, 0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, b_.lsx_i64, 0); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 4, 1, 2, 3); - #else - r_.f32[0] = b_.f32[0]; - r_.f32[1] = a_.f32[1]; - r_.f32[2] = a_.f32[2]; - r_.f32[3] = a_.f32[3]; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if 
defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_move_ss(a, b) simde_mm_move_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_x_mm_broadcastlow_ps(simde__m128 a) { - /* This function broadcasts the first element in the inpu vector to - * all lanes. It is used to avoid generating spurious exceptions in - * *_ss functions since there may be garbage in the upper lanes. */ - - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_shuffle_ps(a, a, 0); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f32 = vdupq_laneq_f32(a_.neon_f32, 0); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_f32 = vec_splat(a_.altivec_f32, 0); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vreplvei_w(a_.lsx_i64, 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_splat(a_.f32[0]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 0, 0, 0, 0); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[0]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_add_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_add_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vaddq_f32(a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_add(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_f32 = vec_add(a_.altivec_f32, b_.altivec_f32); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_f32 = __lsx_vfadd_s(a_.lsx_f32, b_.lsx_f32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f32 = a_.f32 + b_.f32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[i] + b_.f32[i]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_add_ps(a, b) simde_mm_add_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_add_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_add_ss(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_add_ps(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_add_ps(simde_x_mm_broadcastlow_ps(a), simde_x_mm_broadcastlow_ps(b))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - float32_t b0 = vgetq_lane_f32(b_.neon_f32, 0); - float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0); - // the upper values in the result must be the remnants of . 
- r_.neon_f32 = vaddq_f32(a_.neon_f32, value); - #else - r_.f32[0] = a_.f32[0] + b_.f32[0]; - r_.f32[1] = a_.f32[1]; - r_.f32[2] = a_.f32[2]; - r_.f32[3] = a_.f32[3]; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_add_ss(a, b) simde_mm_add_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_and_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_and_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vandq_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vand_v(a_.lsx_i64, b_.lsx_i64); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = a_.i32 & b_.i32; - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_f32 = vec_and(a_.altivec_f32, b_.altivec_f32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] & b_.i32[i]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_and_ps(a, b) simde_mm_and_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_andnot_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_andnot_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vbicq_s32(b_.neon_i32, a_.neon_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) - r_.altivec_f32 = vec_andc(b_.altivec_f32, a_.altivec_f32); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vandn_v(a_.lsx_i64, b_.lsx_i64); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = ~a_.i32 & b_.i32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = ~(a_.i32[i]) & b_.i32[i]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_andnot_ps(a, b) simde_mm_andnot_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_xor_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_xor_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = veorq_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_xor(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i32 = vec_xor(a_.altivec_i32, b_.altivec_i32); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vxor_v(a_.lsx_i64, b_.lsx_i64); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f ^ b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] ^ b_.u32[i]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_xor_ps(a, b) simde_mm_xor_ps((a), (b)) -#endif - 
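Editor's note (reference only, not part of the patch): the removed *_ss fallbacks above all follow one pattern that the simde_x_mm_broadcastlow_ps comment describes — splat lane 0 of both operands, run the packed operation, then merge lane 0 of the result back into a via move_ss — so that garbage in the upper lanes cannot raise spurious floating-point exceptions. A minimal plain-C sketch of that pattern, using a hypothetical vec4 type in place of the real simde__m128:

#include <stdio.h>

typedef struct { float f[4]; } vec4;            /* hypothetical stand-in for simde__m128 */

static vec4 splat0(vec4 v) {                    /* like simde_x_mm_broadcastlow_ps */
    vec4 r = { { v.f[0], v.f[0], v.f[0], v.f[0] } };
    return r;
}

static vec4 add4(vec4 a, vec4 b) {              /* packed add across all four lanes */
    vec4 r;
    for (int i = 0; i < 4; i++) r.f[i] = a.f[i] + b.f[i];
    return r;
}

static vec4 move0(vec4 a, vec4 b) {             /* like _mm_move_ss: lane 0 from b, rest from a */
    a.f[0] = b.f[0];
    return a;
}

static vec4 add_ss(vec4 a, vec4 b) {
    /* Splatting first keeps whatever sits in lanes 1..3 out of the arithmetic,
     * so no spurious exceptions; move0 then restores a's upper lanes. */
    return move0(a, add4(splat0(a), splat0(b)));
}

int main(void) {
    vec4 a = { { 1.0f, 2.0f, 3.0f, 4.0f } };
    vec4 b = { { 10.0f, 99.0f, 99.0f, 99.0f } };
    vec4 r = add_ss(a, b);
    printf("%g %g %g %g\n", r.f[0], r.f[1], r.f[2], r.f[3]); /* 11 2 3 4 */
    return 0;
}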
-SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_or_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_or_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vorrq_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_or(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i32 = vec_or(a_.altivec_i32, b_.altivec_i32); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vor_v(a_.lsx_i64, b_.lsx_i64); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f | b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] | b_.u32[i]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_or_ps(a, b) simde_mm_or_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_x_mm_not_ps(simde__m128 a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - __m128i ai = _mm_castps_si128(a); - return _mm_castsi128_ps(_mm_ternarylogic_epi32(ai, ai, ai, 0x55)); - #elif defined(SIMDE_X86_SSE2_NATIVE) - /* Note: we use ints instead of floats because we don't want cmpeq - * to return false for (NaN, NaN) */ - __m128i ai = _mm_castps_si128(a); - return _mm_castsi128_ps(_mm_andnot_si128(ai, _mm_cmpeq_epi32(ai, ai))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vmvnq_s32(a_.neon_i32); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i32 = vec_nor(a_.altivec_i32, a_.altivec_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_not(a_.wasm_v128); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vnor_v(a_.lsx_i64, a_.lsx_i64); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = ~a_.i32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = ~(a_.i32[i]); - } - #endif - - return simde__m128_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_x_mm_select_ps(simde__m128 a, simde__m128 b, simde__m128 mask) { - /* This function is for when you want to blend two elements together - * according to a mask. It is similar to _mm_blendv_ps, except that - * it is undefined whether the blend is based on the highest bit in - * each lane (like blendv) or just bitwise operations. This allows - * us to implement the function efficiently everywhere. - * - * Basically, you promise that all the lanes in mask are either 0 or - * ~0. 
*/ - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_blendv_ps(a, b, mask); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b), - mask_ = simde__m128_to_private(mask); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vbslq_s32(mask_.neon_u32, b_.neon_i32, a_.neon_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_bitselect(b_.wasm_v128, a_.wasm_v128, mask_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i32 = vec_sel(a_.altivec_i32, b_.altivec_i32, mask_.altivec_u32); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vbitsel_v(a_.lsx_i64, b_.lsx_i64, mask_.lsx_i64); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = a_.i32 ^ ((a_.i32 ^ b_.i32) & mask_.i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] ^ ((a_.i32[i] ^ b_.i32[i]) & mask_.i32[i]); - } - #endif - - return simde__m128_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_avg_pu16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_avg_pu16(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vrhadd_u16(b_.neon_u16, a_.neon_u16); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_GCC_100761) - uint32_t wa SIMDE_VECTOR(16); - uint32_t wb SIMDE_VECTOR(16); - uint32_t wr SIMDE_VECTOR(16); - SIMDE_CONVERT_VECTOR_(wa, a_.u16); - SIMDE_CONVERT_VECTOR_(wb, b_.u16); - wr = (wa + wb + 1) >> 1; - SIMDE_CONVERT_VECTOR_(r_.u16, wr); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = (a_.u16[i] + b_.u16[i] + 1) >> 1; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pavgw(a, b) simde_mm_avg_pu16(a, b) -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_avg_pu16(a, b) simde_mm_avg_pu16(a, b) -# define _m_pavgw(a, b) simde_mm_avg_pu16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_avg_pu8 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_avg_pu8(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vrhadd_u8(b_.neon_u8, a_.neon_u8); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_GCC_100761) - uint16_t wa SIMDE_VECTOR(16); - uint16_t wb SIMDE_VECTOR(16); - uint16_t wr SIMDE_VECTOR(16); - SIMDE_CONVERT_VECTOR_(wa, a_.u8); - SIMDE_CONVERT_VECTOR_(wb, b_.u8); - wr = (wa + wb + 1) >> 1; - SIMDE_CONVERT_VECTOR_(r_.u8, wr); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = (a_.u8[i] + b_.u8[i] + 1) >> 1; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pavgb(a, b) simde_mm_avg_pu8(a, b) -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_avg_pu8(a, b) simde_mm_avg_pu8(a, b) -# define _m_pavgb(a, b) simde_mm_avg_pu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_x_mm_abs_ps(simde__m128 a) { - #if 
defined(SIMDE_X86_SSE_NATIVE) - simde_float32 mask_; - uint32_t u32_ = UINT32_C(0x7FFFFFFF); - simde_memcpy(&mask_, &u32_, sizeof(u32_)); - return _mm_and_ps(_mm_set1_ps(mask_), a); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vabsq_f32(a_.neon_f32); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) - r_.altivec_f32 = vec_abs(a_.altivec_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_abs(a_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_fabsf(a_.f32[i]); - } - #endif - - return simde__m128_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpeq_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_cmpeq_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vceqq_f32(a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_eq(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) - r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_cmpeq(a_.altivec_f32, b_.altivec_f32)); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vfcmp_ceq_s(a_.lsx_f32, b_.lsx_f32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), a_.f32 == b_.f32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = (a_.f32[i] == b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0); - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpeq_ps(a, b) simde_mm_cmpeq_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpeq_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_cmpeq_ss(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_cmpeq_ps(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_cmpeq_ps(simde_x_mm_broadcastlow_ps(a), simde_x_mm_broadcastlow_ps(b))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - r_.u32[0] = (a_.f32[0] == b_.f32[0]) ? 
~UINT32_C(0) : UINT32_C(0); - SIMDE_VECTORIZE - for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = a_.u32[i]; - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpeq_ss(a, b) simde_mm_cmpeq_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpge_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_cmpge_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vcgeq_f32(a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_ge(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_cmpge(a_.altivec_f32, b_.altivec_f32)); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vfcmp_cle_s(b_.lsx_f32, a_.lsx_f32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 >= b_.f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = (a_.f32[i] >= b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0); - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpge_ps(a, b) simde_mm_cmpge_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpge_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI) - return _mm_cmpge_ss(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_cmpge_ps(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_cmpge_ps(simde_x_mm_broadcastlow_ps(a), simde_x_mm_broadcastlow_ps(b))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - r_.u32[0] = (a_.f32[0] >= b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); - SIMDE_VECTORIZE - for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = a_.u32[i]; - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpge_ss(a, b) simde_mm_cmpge_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpgt_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_cmpgt_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vcgtq_f32(a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_gt(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_cmpgt(a_.altivec_f32, b_.altivec_f32)); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vfcmp_clt_s(b_.lsx_f32, a_.lsx_f32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 > b_.f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = (a_.f32[i] > b_.f32[i]) ? 
~UINT32_C(0) : UINT32_C(0); - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpgt_ps(a, b) simde_mm_cmpgt_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpgt_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI) - return _mm_cmpgt_ss(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_cmpgt_ps(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_cmpgt_ps(simde_x_mm_broadcastlow_ps(a), simde_x_mm_broadcastlow_ps(b))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - r_.u32[0] = (a_.f32[0] > b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); - SIMDE_VECTORIZE - for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = a_.u32[i]; - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpgt_ss(a, b) simde_mm_cmpgt_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmple_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_cmple_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vcleq_f32(a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_le(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_cmple(a_.altivec_f32, b_.altivec_f32)); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vfcmp_cle_s(a_.lsx_f32, b_.lsx_f32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 <= b_.f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = (a_.f32[i] <= b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0); - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmple_ps(a, b) simde_mm_cmple_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmple_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_cmple_ss(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_cmple_ps(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_cmple_ps(simde_x_mm_broadcastlow_ps(a), simde_x_mm_broadcastlow_ps(b))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - r_.u32[0] = (a_.f32[0] <= b_.f32[0]) ? 
~UINT32_C(0) : UINT32_C(0); - SIMDE_VECTORIZE - for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = a_.u32[i]; - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmple_ss(a, b) simde_mm_cmple_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmplt_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_cmplt_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vcltq_f32(a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_lt(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_cmplt(a_.altivec_f32, b_.altivec_f32)); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vfcmp_clt_s(a_.lsx_f32, b_.lsx_f32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 < b_.f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = (a_.f32[i] < b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0); - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmplt_ps(a, b) simde_mm_cmplt_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmplt_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_cmplt_ss(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_cmplt_ps(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_cmplt_ps(simde_x_mm_broadcastlow_ps(a), simde_x_mm_broadcastlow_ps(b))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - r_.u32[0] = (a_.f32[0] < b_.f32[0]) ? 
~UINT32_C(0) : UINT32_C(0); - SIMDE_VECTORIZE - for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = a_.u32[i]; - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmplt_ss(a, b) simde_mm_cmplt_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpneq_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_cmpneq_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vmvnq_u32(vceqq_f32(a_.neon_f32, b_.neon_f32)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_ne(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) - r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_cmpeq(a_.altivec_f32, b_.altivec_f32)); - r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_nor(r_.altivec_f32, r_.altivec_f32)); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vfcmp_cune_s(a_.lsx_f32, b_.lsx_f32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 != b_.f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = (a_.f32[i] != b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0); - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpneq_ps(a, b) simde_mm_cmpneq_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpneq_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_cmpneq_ss(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_cmpneq_ps(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_cmpneq_ps(simde_x_mm_broadcastlow_ps(a), simde_x_mm_broadcastlow_ps(b))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - r_.u32[0] = (a_.f32[0] != b_.f32[0]) ? 
~UINT32_C(0) : UINT32_C(0); - SIMDE_VECTORIZE - for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = a_.u32[i]; - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpneq_ss(a, b) simde_mm_cmpneq_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpnge_ps (simde__m128 a, simde__m128 b) { - return simde_mm_cmplt_ps(a, b); -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpnge_ps(a, b) simde_mm_cmpnge_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpnge_ss (simde__m128 a, simde__m128 b) { - return simde_mm_cmplt_ss(a, b); -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpnge_ss(a, b) simde_mm_cmpnge_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpngt_ps (simde__m128 a, simde__m128 b) { - return simde_mm_cmple_ps(a, b); -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpngt_ps(a, b) simde_mm_cmpngt_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpngt_ss (simde__m128 a, simde__m128 b) { - return simde_mm_cmple_ss(a, b); -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpngt_ss(a, b) simde_mm_cmpngt_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpnle_ps (simde__m128 a, simde__m128 b) { - return simde_mm_cmpgt_ps(a, b); -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpnle_ps(a, b) simde_mm_cmpnle_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpnle_ss (simde__m128 a, simde__m128 b) { - return simde_mm_cmpgt_ss(a, b); -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpnle_ss(a, b) simde_mm_cmpnle_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpnlt_ps (simde__m128 a, simde__m128 b) { - return simde_mm_cmpge_ps(a, b); -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpnlt_ps(a, b) simde_mm_cmpnlt_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpnlt_ss (simde__m128 a, simde__m128 b) { - return simde_mm_cmpge_ss(a, b); -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpnlt_ss(a, b) simde_mm_cmpnlt_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpord_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_cmpord_ps(a, b); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_v128_and(wasm_f32x4_eq(a, a), wasm_f32x4_eq(b, b)); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - /* Note: NEON does not have ordered compare builtin - Need to compare a eq a and b eq b to check for NaN - Do AND of results to get final */ - uint32x4_t ceqaa = vceqq_f32(a_.neon_f32, a_.neon_f32); - uint32x4_t ceqbb = vceqq_f32(b_.neon_f32, b_.neon_f32); - r_.neon_u32 = vandq_u32(ceqaa, ceqbb); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_and(wasm_f32x4_eq(a_.wasm_v128, a_.wasm_v128), wasm_f32x4_eq(b_.wasm_v128, b_.wasm_v128)); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), - vec_and(vec_cmpeq(a_.altivec_f32, a_.altivec_f32), vec_cmpeq(b_.altivec_f32, b_.altivec_f32))); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vfcmp_cun_s(a_.lsx_f32, b_.lsx_f32); - r_.lsx_i64 = __lsx_vnor_v(r_.lsx_i64, 
r_.lsx_i64); - #elif defined(simde_math_isnanf) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = (simde_math_isnanf(a_.f32[i]) || simde_math_isnanf(b_.f32[i])) ? UINT32_C(0) : ~UINT32_C(0); - } - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpord_ps(a, b) simde_mm_cmpord_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpunord_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_cmpunord_ps(a, b); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_v128_or(wasm_f32x4_ne(a, a), wasm_f32x4_ne(b, b)); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint32x4_t ceqaa = vceqq_f32(a_.neon_f32, a_.neon_f32); - uint32x4_t ceqbb = vceqq_f32(b_.neon_f32, b_.neon_f32); - r_.neon_u32 = vmvnq_u32(vandq_u32(ceqaa, ceqbb)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_or(wasm_f32x4_ne(a_.wasm_v128, a_.wasm_v128), wasm_f32x4_ne(b_.wasm_v128, b_.wasm_v128)); - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), - vec_nand(vec_cmpeq(a_.altivec_f32, a_.altivec_f32), vec_cmpeq(b_.altivec_f32, b_.altivec_f32))); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), - vec_and(vec_cmpeq(a_.altivec_f32, a_.altivec_f32), vec_cmpeq(b_.altivec_f32, b_.altivec_f32))); - r_.altivec_f32 = vec_nor(r_.altivec_f32, r_.altivec_f32); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vfcmp_cun_s(a_.lsx_f32, b_.lsx_f32); - #elif defined(simde_math_isnanf) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = (simde_math_isnanf(a_.f32[i]) || simde_math_isnanf(b_.f32[i])) ? ~UINT32_C(0) : UINT32_C(0); - } - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpunord_ps(a, b) simde_mm_cmpunord_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpunord_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI) - return _mm_cmpunord_ss(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_cmpunord_ps(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_cmpunord_ps(simde_x_mm_broadcastlow_ps(a), simde_x_mm_broadcastlow_ps(b))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(simde_math_isnanf) - r_.u32[0] = (simde_math_isnanf(a_.f32[0]) || simde_math_isnanf(b_.f32[0])) ? 
~UINT32_C(0) : UINT32_C(0); - SIMDE_VECTORIZE - for (size_t i = 1 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i]; - } - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpunord_ss(a, b) simde_mm_cmpunord_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_comieq_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_comieq_ss(a, b); - #else - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32); - uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32); - uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); - uint32x4_t a_eq_b = vceqq_f32(a_.neon_f32, b_.neon_f32); - return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_eq_b), 0) != 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f32x4_extract_lane(a_.wasm_v128, 0) == wasm_f32x4_extract_lane(b_.wasm_v128, 0); - #else - return a_.f32[0] == b_.f32[0]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_comieq_ss(a, b) simde_mm_comieq_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_comige_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_comige_ss(a, b); - #else - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32); - uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32); - uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_ge_b = vcgeq_f32(a_.neon_f32, b_.neon_f32); - return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f32x4_extract_lane(a_.wasm_v128, 0) >= wasm_f32x4_extract_lane(b_.wasm_v128, 0); - #else - return a_.f32[0] >= b_.f32[0]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_comige_ss(a, b) simde_mm_comige_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_comigt_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_comigt_ss(a, b); - #else - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32); - uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32); - uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_gt_b = vcgtq_f32(a_.neon_f32, b_.neon_f32); - return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f32x4_extract_lane(a_.wasm_v128, 0) > wasm_f32x4_extract_lane(b_.wasm_v128, 0); - #else - return a_.f32[0] > b_.f32[0]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_comigt_ss(a, b) simde_mm_comigt_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_comile_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_comile_ss(a, b); - #else - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32); - uint32x4_t b_not_nan = 
vceqq_f32(b_.neon_f32, b_.neon_f32); - uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); - uint32x4_t a_le_b = vcleq_f32(a_.neon_f32, b_.neon_f32); - return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_le_b), 0) != 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f32x4_extract_lane(a_.wasm_v128, 0) <= wasm_f32x4_extract_lane(b_.wasm_v128, 0); - #else - return a_.f32[0] <= b_.f32[0]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_comile_ss(a, b) simde_mm_comile_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_comilt_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_comilt_ss(a, b); - #else - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32); - uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32); - uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); - uint32x4_t a_lt_b = vcltq_f32(a_.neon_f32, b_.neon_f32); - return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_lt_b), 0) != 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f32x4_extract_lane(a_.wasm_v128, 0) < wasm_f32x4_extract_lane(b_.wasm_v128, 0); - #else - return a_.f32[0] < b_.f32[0]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_comilt_ss(a, b) simde_mm_comilt_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_comineq_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_comineq_ss(a, b); - #else - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32); - uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32); - uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_neq_b = vmvnq_u32(vceqq_f32(a_.neon_f32, b_.neon_f32)); - return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_neq_b), 0) != 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f32x4_extract_lane(a_.wasm_v128, 0) != wasm_f32x4_extract_lane(b_.wasm_v128, 0); - #else - return a_.f32[0] != b_.f32[0]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_comineq_ss(a, b) simde_mm_comineq_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_x_mm_copysign_ps(simde__m128 dest, simde__m128 src) { - simde__m128_private - r_, - dest_ = simde__m128_to_private(dest), - src_ = simde__m128_to_private(src); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - const uint32x4_t sign_pos = vreinterpretq_u32_f32(vdupq_n_f32(-SIMDE_FLOAT32_C(0.0))); - r_.neon_u32 = vbslq_u32(sign_pos, src_.neon_u32, dest_.neon_u32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - const v128_t sign_pos = wasm_f32x4_splat(-0.0f); - r_.wasm_v128 = wasm_v128_bitselect(src_.wasm_v128, dest_.wasm_v128, sign_pos); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - #if defined(SIMDE_BUG_VEC_CPSGN_REVERSED_ARGS) - r_.altivec_f32 = vec_cpsgn(dest_.altivec_f32, src_.altivec_f32); - #else - r_.altivec_f32 = vec_cpsgn(src_.altivec_f32, dest_.altivec_f32); - #endif - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) - const SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) sign_pos = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), vec_splats(-0.0f)); - r_.altivec_f32 = vec_sel(dest_.altivec_f32, 
src_.altivec_f32, sign_pos); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - const v4f32 sign_pos = {-0.0f, -0.0f, -0.0f, -0.0f}; - r_.lsx_i64 = __lsx_vbitsel_v(dest_.lsx_i64, src_.lsx_i64, (v2i64)sign_pos); - #elif defined(SIMDE_IEEE754_STORAGE) - (void) src_; - (void) dest_; - simde__m128 sign_pos = simde_mm_set1_ps(-0.0f); - r_ = simde__m128_to_private(simde_mm_xor_ps(dest, simde_mm_and_ps(simde_mm_xor_ps(dest, src), sign_pos))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_copysignf(dest_.f32[i], src_.f32[i]); - } - #endif - - return simde__m128_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_x_mm_xorsign_ps(simde__m128 dest, simde__m128 src) { - return simde_mm_xor_ps(simde_mm_and_ps(simde_mm_set1_ps(-0.0f), src), dest); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cvt_pi2ps (simde__m128 a, simde__m64 b) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_cvt_pi2ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vcombine_f32(vcvt_f32_s32(b_.neon_i32), vget_high_f32(a_.neon_f32)); - #elif defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.m64_private[0].f32, b_.i32); - r_.m64_private[1] = a_.m64_private[1]; - #else - r_.f32[0] = (simde_float32) b_.i32[0]; - r_.f32[1] = (simde_float32) b_.i32[1]; - r_.i32[2] = a_.i32[2]; - r_.i32[3] = a_.i32[3]; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cvt_pi2ps(a, b) simde_mm_cvt_pi2ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_cvt_ps2pi (simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_cvt_ps2pi(a); - #else - simde__m64_private r_; - simde__m128_private a_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - a_ = simde__m128_to_private(simde_mm_round_ps(a, SIMDE_MM_FROUND_CUR_DIRECTION)); - r_.neon_i32 = vcvt_s32_f32(vget_low_f32(a_.neon_f32)); - #elif defined(SIMDE_CONVERT_VECTOR_) && SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_BUG_GCC_100761) - a_ = simde__m128_to_private(simde_mm_round_ps(a, SIMDE_MM_FROUND_CUR_DIRECTION)); - SIMDE_CONVERT_VECTOR_(r_.i32, a_.m64_private[0].f32); - #else - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, simde_math_nearbyintf(a_.f32[i])); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cvt_ps2pi(a) simde_mm_cvt_ps2pi((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cvt_si2ss (simde__m128 a, int32_t b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_cvt_si2ss(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vsetq_lane_f32(HEDLEY_STATIC_CAST(float, b), a_.neon_f32, 0); - #else - r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b); - r_.i32[1] = a_.i32[1]; - r_.i32[2] = a_.i32[2]; - r_.i32[3] = a_.i32[3]; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cvt_si2ss(a, b) simde_mm_cvt_si2ss((a), b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int32_t -simde_mm_cvt_ss2si (simde__m128 a) { - #if 
defined(SIMDE_X86_SSE_NATIVE) - return _mm_cvt_ss2si(a); - #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) && !defined(SIMDE_BUG_GCC_95399) - return vgetq_lane_s32(vcvtnq_s32_f32(simde__m128_to_neon_f32(a)), 0); - #else - simde__m128_private a_ = simde__m128_to_private(simde_mm_round_ps(a, SIMDE_MM_FROUND_CUR_DIRECTION)); - #if !defined(SIMDE_FAST_CONVERSION_RANGE) - return ((a_.f32[0] > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) && - (a_.f32[0] < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) ? - SIMDE_CONVERT_FTOI(int32_t, a_.f32[0]) : INT32_MIN; - #else - return SIMDE_CONVERT_FTOI(int32_t, a_.f32[0]); - #endif - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cvt_ss2si(a) simde_mm_cvt_ss2si((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cvtpi16_ps (simde__m64 a) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_cvtpi16_ps(a); - #else - simde__m128_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vcvtq_f32_s32(vmovl_s16(a_.neon_i16)); - #elif defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.f32, a_.i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - simde_float32 v = a_.i16[i]; - r_.f32[i] = v; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cvtpi16_ps(a) simde_mm_cvtpi16_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cvtpi32_ps (simde__m128 a, simde__m64 b) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_cvtpi32_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vcombine_f32(vcvt_f32_s32(b_.neon_i32), vget_high_f32(a_.neon_f32)); - #elif defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.m64_private[0].f32, b_.i32); - r_.m64_private[1] = a_.m64_private[1]; - #else - r_.f32[0] = (simde_float32) b_.i32[0]; - r_.f32[1] = (simde_float32) b_.i32[1]; - r_.i32[2] = a_.i32[2]; - r_.i32[3] = a_.i32[3]; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cvtpi32_ps(a, b) simde_mm_cvtpi32_ps((a), b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cvtpi32x2_ps (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_cvtpi32x2_ps(a, b); - #else - simde__m128_private r_; - simde__m64_private - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vcvtq_f32_s32(vcombine_s32(a_.neon_i32, b_.neon_i32)); - #elif defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.m64_private[0].f32, a_.i32); - SIMDE_CONVERT_VECTOR_(r_.m64_private[1].f32, b_.i32); - #else - r_.f32[0] = (simde_float32) a_.i32[0]; - r_.f32[1] = (simde_float32) a_.i32[1]; - r_.f32[2] = (simde_float32) b_.i32[0]; - r_.f32[3] = (simde_float32) b_.i32[1]; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cvtpi32x2_ps(a, b) simde_mm_cvtpi32x2_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cvtpi8_ps (simde__m64 a) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return 
_mm_cvtpi8_ps(a); - #else - simde__m128_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(a_.neon_i8)))); - #else - r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, a_.i8[0]); - r_.f32[1] = HEDLEY_STATIC_CAST(simde_float32, a_.i8[1]); - r_.f32[2] = HEDLEY_STATIC_CAST(simde_float32, a_.i8[2]); - r_.f32[3] = HEDLEY_STATIC_CAST(simde_float32, a_.i8[3]); - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cvtpi8_ps(a) simde_mm_cvtpi8_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_cvtps_pi16 (simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_cvtps_pi16(a); - #else - simde__m64_private r_; - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_GCC_95399) - r_.neon_i16 = vmovn_s32(vcvtq_s32_f32(vrndiq_f32(a_.neon_f32))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = SIMDE_CONVERT_FTOI(int16_t, simde_math_roundf(a_.f32[i])); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cvtps_pi16(a) simde_mm_cvtps_pi16((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_cvtps_pi32 (simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_cvtps_pi32(a); - #else - simde__m64_private r_; - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) && !defined(SIMDE_BUG_GCC_95399) - r_.neon_i32 = vcvt_s32_f32(vget_low_f32(vrndiq_f32(a_.neon_f32))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - simde_float32 v = simde_math_roundf(a_.f32[i]); - #if !defined(SIMDE_FAST_CONVERSION_RANGE) - r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) ? - SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN; - #else - r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v); - #endif - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cvtps_pi32(a) simde_mm_cvtps_pi32((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_cvtps_pi8 (simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_cvtps_pi8(a); - #else - simde__m64_private r_; - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_GCC_95471) - /* Clamp the input to [INT8_MIN, INT8_MAX], round, convert to i32, narrow to - * i16, combine with an all-zero vector of i16 (which will become the upper - * half), narrow to i8. 
*/ - float32x4_t max = vdupq_n_f32(HEDLEY_STATIC_CAST(simde_float32, INT8_MAX)); - float32x4_t min = vdupq_n_f32(HEDLEY_STATIC_CAST(simde_float32, INT8_MIN)); - float32x4_t values = vrndnq_f32(vmaxq_f32(vminq_f32(max, a_.neon_f32), min)); - r_.neon_i8 = vmovn_s16(vcombine_s16(vmovn_s32(vcvtq_s32_f32(values)), vdup_n_s16(0))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - if (a_.f32[i] > HEDLEY_STATIC_CAST(simde_float32, INT8_MAX)) - r_.i8[i] = INT8_MAX; - else if (a_.f32[i] < HEDLEY_STATIC_CAST(simde_float32, INT8_MIN)) - r_.i8[i] = INT8_MIN; - else - r_.i8[i] = SIMDE_CONVERT_FTOI(int8_t, simde_math_roundf(a_.f32[i])); - } - /* Note: the upper half is undefined */ - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cvtps_pi8(a) simde_mm_cvtps_pi8((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cvtpu16_ps (simde__m64 a) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_cvtpu16_ps(a); - #else - simde__m128_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vcvtq_f32_u32(vmovl_u16(a_.neon_u16)); - #elif defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.f32, a_.u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = (simde_float32) a_.u16[i]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cvtpu16_ps(a) simde_mm_cvtpu16_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cvtpu8_ps (simde__m64 a) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_cvtpu8_ps(a); - #else - simde__m128_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(a_.neon_u8)))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = HEDLEY_STATIC_CAST(simde_float32, a_.u8[i]); - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cvtpu8_ps(a) simde_mm_cvtpu8_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cvtsi32_ss (simde__m128 a, int32_t b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_cvtsi32_ss(a, b); - #else - simde__m128_private r_; - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vsetq_lane_f32(HEDLEY_STATIC_CAST(float32_t, b), a_.neon_f32, 0); - #else - r_ = a_; - r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b); - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cvtsi32_ss(a, b) simde_mm_cvtsi32_ss((a), b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cvtsi64_ss (simde__m128 a, int64_t b) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64) - #if !defined(__PGI) - return _mm_cvtsi64_ss(a, b); - #else - return _mm_cvtsi64x_ss(a, b); - #endif - #else - simde__m128_private r_; - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vsetq_lane_f32(HEDLEY_STATIC_CAST(float32_t, b), a_.neon_f32, 0); - #else - r_ = a_; - r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b); - #endif - - 
return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) -# define _mm_cvtsi64_ss(a, b) simde_mm_cvtsi64_ss((a), b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde_float32 -simde_mm_cvtss_f32 (simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_cvtss_f32(a); - #else - simde__m128_private a_ = simde__m128_to_private(a); - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - return vgetq_lane_f32(a_.neon_f32, 0); - #else - return a_.f32[0]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cvtss_f32(a) simde_mm_cvtss_f32((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int32_t -simde_mm_cvtss_si32 (simde__m128 a) { - return simde_mm_cvt_ss2si(a); -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cvtss_si32(a) simde_mm_cvtss_si32((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int64_t -simde_mm_cvtss_si64 (simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64) - #if !defined(__PGI) - return _mm_cvtss_si64(a); - #else - return _mm_cvtss_si64x(a); - #endif - #else - simde__m128_private a_ = simde__m128_to_private(a); - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - return SIMDE_CONVERT_FTOI(int64_t, simde_math_roundf(vgetq_lane_f32(a_.neon_f32, 0))); - #else - return SIMDE_CONVERT_FTOI(int64_t, simde_math_roundf(a_.f32[0])); - #endif - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) -# define _mm_cvtss_si64(a) simde_mm_cvtss_si64((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_cvtt_ps2pi (simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_cvtt_ps2pi(a); - #else - simde__m64_private r_; - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) - r_.neon_i32 = vcvt_s32_f32(vget_low_f32(a_.neon_f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - simde_float32 v = a_.f32[i]; - #if !defined(SIMDE_FAST_CONVERSION_RANGE) - r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) ? - SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN; - #else - r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v); - #endif - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_mm_cvttps_pi32(a) simde_mm_cvtt_ps2pi(a) -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cvtt_ps2pi(a) simde_mm_cvtt_ps2pi((a)) -# define _mm_cvttps_pi32(a) simde_mm_cvttps_pi32((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int32_t -simde_mm_cvtt_ss2si (simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_cvtt_ss2si(a); - #else - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) - return SIMDE_CONVERT_FTOI(int32_t, vgetq_lane_f32(a_.neon_f32, 0)); - #else - simde_float32 v = a_.f32[0]; - #if !defined(SIMDE_FAST_CONVERSION_RANGE) - return ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) ? 
- SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN; - #else - return SIMDE_CONVERT_FTOI(int32_t, v); - #endif - #endif - #endif -} -#define simde_mm_cvttss_si32(a) simde_mm_cvtt_ss2si((a)) -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cvtt_ss2si(a) simde_mm_cvtt_ss2si((a)) -# define _mm_cvttss_si32(a) simde_mm_cvtt_ss2si((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int64_t -simde_mm_cvttss_si64 (simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(_MSC_VER) - #if defined(__PGI) - return _mm_cvttss_si64x(a); - #else - return _mm_cvttss_si64(a); - #endif - #else - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - return SIMDE_CONVERT_FTOI(int64_t, vgetq_lane_f32(a_.neon_f32, 0)); - #else - return SIMDE_CONVERT_FTOI(int64_t, a_.f32[0]); - #endif - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) -# define _mm_cvttss_si64(a) simde_mm_cvttss_si64((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpord_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_cmpord_ss(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_cmpord_ps(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_cmpord_ps(simde_x_mm_broadcastlow_ps(a), simde_x_mm_broadcastlow_ps(b))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - #if defined(simde_math_isnanf) - r_.u32[0] = (simde_math_isnanf(simde_mm_cvtss_f32(a)) || simde_math_isnanf(simde_mm_cvtss_f32(b))) ? UINT32_C(0) : ~UINT32_C(0); - SIMDE_VECTORIZE - for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = a_.u32[i]; - } - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpord_ss(a, b) simde_mm_cmpord_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_div_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_div_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f32 = vdivq_f32(a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - float32x4_t recip0 = vrecpeq_f32(b_.neon_f32); - float32x4_t recip1 = vmulq_f32(recip0, vrecpsq_f32(recip0, b_.neon_f32)); - r_.neon_f32 = vmulq_f32(a_.neon_f32, recip1); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_div(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - r_.altivec_f32 = vec_div(a_.altivec_f32, b_.altivec_f32); - #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) - r_.lsx_f32 = __lsx_vfdiv_s(a_.lsx_f32, b_.lsx_f32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f32 = a_.f32 / b_.f32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[i] / b_.f32[i]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_div_ps(a, b) simde_mm_div_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_div_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_div_ss(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && 
defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_div_ps(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_div_ps(simde_x_mm_broadcastlow_ps(a), simde_x_mm_broadcastlow_ps(b))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - float32_t value = - vgetq_lane_f32(simde__m128_to_private(simde_mm_div_ps(a, b)).neon_f32, 0); - r_.neon_f32 = vsetq_lane_f32(value, a_.neon_f32, 0); - #else - r_.f32[0] = a_.f32[0] / b_.f32[0]; - SIMDE_VECTORIZE - for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[i]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_div_ss(a, b) simde_mm_div_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int16_t -simde_mm_extract_pi16 (simde__m64 a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) { - simde__m64_private a_ = simde__m64_to_private(a); - return a_.i16[imm8]; -} -#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(HEDLEY_PGI_VERSION) && !defined(SIMDE_BUG_CLANG_44589) - #define simde_mm_extract_pi16(a, imm8) HEDLEY_STATIC_CAST(int16_t, _mm_extract_pi16(a, imm8)) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #define simde_mm_extract_pi16(a, imm8) vget_lane_s16(simde__m64_to_private(a).neon_i16, imm8) -#endif -#define simde_m_pextrw(a, imm8) simde_mm_extract_pi16(a, imm8) -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_extract_pi16(a, imm8) simde_mm_extract_pi16((a), (imm8)) -# define _m_pextrw(a, imm8) simde_mm_extract_pi16((a), (imm8)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_insert_pi16 (simde__m64 a, int16_t i, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) { - simde__m64_private - a_ = simde__m64_to_private(a); - - a_.i16[imm8] = i; - - return simde__m64_from_private(a_); -} -#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI) && !defined(SIMDE_BUG_CLANG_44589) - #define simde_mm_insert_pi16(a, i, imm8) _mm_insert_pi16(a, i, imm8) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #define simde_mm_insert_pi16(a, i, imm8) simde__m64_from_neon_i16(vset_lane_s16((i), simde__m64_to_neon_i16(a), (imm8))) -#endif -#define simde_m_pinsrw(a, i, imm8) (simde_mm_insert_pi16(a, i, imm8)) -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_insert_pi16(a, i, imm8) simde_mm_insert_pi16(a, i, imm8) -# define _m_pinsrw(a, i, imm8) simde_mm_insert_pi16(a, i, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_load_ps (simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)]) { -#if defined(SIMDE_X86_SSE_NATIVE) - return _mm_load_ps(mem_addr); -#else - simde__m128_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vld1q_f32(mem_addr); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - r_.altivec_f32 = vec_vsx_ld(0, mem_addr); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_f32 = vec_ld(0, mem_addr); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vld(mem_addr, 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_load(mem_addr); - #else - simde_memcpy(&r_, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128), sizeof(r_)); - #endif - - return simde__m128_from_private(r_); -#endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_load_ps(mem_addr) simde_mm_load_ps(mem_addr) -#endif - 
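Editor's note on the load fallbacks above: `simde_mm_load_ps` keeps SSE's 16-byte alignment contract (the portable branch copies through `SIMDE_ALIGN_ASSUME_LIKE`), while `simde_mm_loadu_ps`, defined a little further down, accepts any address. A minimal usage sketch, assuming the simde headers are reachable as `simde/x86/sse.h`; the buffer and function names are illustrative only and are not part of this diff:

```c
#include <simde/x86/sse.h>

/* 16-byte alignment keeps simde_mm_load_ps well-defined on every backend. */
static _Alignas(16) simde_float32 aligned_buf[4] = { 1.0f, 2.0f, 3.0f, 4.0f };

simde__m128 sum_quads(const simde_float32 *unaligned_buf) {
  simde__m128 a = simde_mm_load_ps(aligned_buf);    /* requires 16-byte alignment */
  simde__m128 b = simde_mm_loadu_ps(unaligned_buf); /* no alignment requirement */
  return simde_mm_add_ps(a, b);
}
```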
-SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_load1_ps (simde_float32 const* mem_addr) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_load_ps1(mem_addr); - #else - simde__m128_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vld1q_dup_f32(mem_addr); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vldrepl_w(mem_addr, 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_load32_splat(mem_addr); - #else - r_ = simde__m128_to_private(simde_mm_set1_ps(*mem_addr)); - #endif - - return simde__m128_from_private(r_); - #endif -} -#define simde_mm_load_ps1(mem_addr) simde_mm_load1_ps(mem_addr) -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_load_ps1(mem_addr) simde_mm_load1_ps(mem_addr) -# define _mm_load1_ps(mem_addr) simde_mm_load1_ps(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_load_ss (simde_float32 const* mem_addr) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_load_ss(mem_addr); - #else - simde__m128_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vsetq_lane_f32(*mem_addr, vdupq_n_f32(0), 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_load32_zero(mem_addr); - #else - r_.f32[0] = *mem_addr; - r_.i32[1] = 0; - r_.i32[2] = 0; - r_.i32[3] = 0; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_load_ss(mem_addr) simde_mm_load_ss(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_loadh_pi (simde__m128 a, simde__m64 const* mem_addr) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_loadh_pi(a, HEDLEY_REINTERPRET_CAST(__m64 const*, mem_addr)); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vcombine_f32(vget_low_f32(a_.neon_f32), vld1_f32(HEDLEY_REINTERPRET_CAST(const float32_t*, mem_addr))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_load64_lane(mem_addr, a_.wasm_v128, 1); - #else - simde__m64_private b_ = *HEDLEY_REINTERPRET_CAST(simde__m64_private const*, mem_addr); - r_.f32[0] = a_.f32[0]; - r_.f32[1] = a_.f32[1]; - r_.f32[2] = b_.f32[0]; - r_.f32[3] = b_.f32[1]; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) - #if HEDLEY_HAS_WARNING("-Wold-style-cast") - #define _mm_loadh_pi(a, mem_addr) simde_mm_loadh_pi((a), HEDLEY_REINTERPRET_CAST(simde__m64 const*, (mem_addr))) - #else - #define _mm_loadh_pi(a, mem_addr) simde_mm_loadh_pi((a), (simde__m64 const*) (mem_addr)) - #endif -#endif - -/* The SSE documentation says that there are no alignment requirements - for mem_addr. Unfortunately they used the __m64 type for the argument - which is supposed to be 8-byte aligned, so some compilers (like clang - with -Wcast-align) will generate a warning if you try to cast, say, - a simde_float32* to a simde__m64* for this function. - - I think the choice of argument type is unfortunate, but I do think we - need to stick to it here. 
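Editorial aside on the surrounding comment: a caller can avoid the `-Wcast-align` warning it describes by copying the two floats into a `simde__m64` with `memcpy` instead of casting a `simde_float32*`. A small sketch under that assumption; the helper name is mine, not part of the removed file:

```c
#include <string.h>
#include <simde/x86/sse.h>

/* Fill the low half of dst from two floats without casting a
 * simde_float32* to simde__m64*, which -Wcast-align may flag. */
static simde__m128 load_low_pair(simde__m128 dst, const simde_float32 src[2]) {
  simde__m64 tmp;
  memcpy(&tmp, src, sizeof(tmp)); /* 8 bytes, no alignment assumption */
  return simde_mm_loadl_pi(dst, &tmp);
}
```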
If there is demand I can always add something - like simde_x_mm_loadl_f32(simde__m128, simde_float32 mem_addr[2]) */ -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_loadl_pi (simde__m128 a, simde__m64 const* mem_addr) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_loadl_pi(a, HEDLEY_REINTERPRET_CAST(__m64 const*, mem_addr)); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vcombine_f32(vld1_f32( - HEDLEY_REINTERPRET_CAST(const float32_t*, mem_addr)), vget_high_f32(a_.neon_f32)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_load64_lane(mem_addr, a_.wasm_v128, 0); - #else - simde__m64_private b_; - simde_memcpy(&b_, mem_addr, sizeof(b_)); - r_.i32[0] = b_.i32[0]; - r_.i32[1] = b_.i32[1]; - r_.i32[2] = a_.i32[2]; - r_.i32[3] = a_.i32[3]; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) - #if HEDLEY_HAS_WARNING("-Wold-style-cast") - #define _mm_loadl_pi(a, mem_addr) simde_mm_loadl_pi((a), HEDLEY_REINTERPRET_CAST(simde__m64 const*, (mem_addr))) - #else - #define _mm_loadl_pi(a, mem_addr) simde_mm_loadl_pi((a), (simde__m64 const*) (mem_addr)) - #endif -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_loadr_ps (simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)]) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_loadr_ps(mem_addr); - #else - simde__m128_private - r_, - v_ = simde__m128_to_private(simde_mm_load_ps(mem_addr)); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vrev64q_f32(v_.neon_f32); - r_.neon_f32 = vextq_f32(r_.neon_f32, r_.neon_f32, 2); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && defined(__PPC64__) - r_.altivec_f32 = vec_reve(v_.altivec_f32); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vshuf4i_w(v_.lsx_i64, 0x1b); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, v_.f32, v_.f32, 3, 2, 1, 0); - #else - r_.f32[0] = v_.f32[3]; - r_.f32[1] = v_.f32[2]; - r_.f32[2] = v_.f32[1]; - r_.f32[3] = v_.f32[0]; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_loadr_ps(mem_addr) simde_mm_loadr_ps(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_loadu_ps (simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)]) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_loadu_ps(mem_addr); - #else - simde__m128_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vld1q_f32(HEDLEY_REINTERPRET_CAST(const float32_t*, mem_addr)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_load(mem_addr); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && defined(__PPC64__) - r_.altivec_f32 = vec_vsx_ld(0, mem_addr); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vld(mem_addr, 0); - #else - simde_memcpy(&r_, mem_addr, sizeof(r_)); - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_loadu_ps(mem_addr) simde_mm_loadu_ps(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_maskmove_si64 (simde__m64 a, simde__m64 mask, int8_t* mem_addr) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - _mm_maskmove_si64(a, mask, HEDLEY_REINTERPRET_CAST(char*, mem_addr)); - #else - simde__m64_private - a_ = simde__m64_to_private(a), - mask_ = simde__m64_to_private(mask); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i8) / 
sizeof(a_.i8[0])) ; i++) - if (mask_.i8[i] < 0) - mem_addr[i] = a_.i8[i]; - #endif -} -#define simde_m_maskmovq(a, mask, mem_addr) simde_mm_maskmove_si64(a, mask, mem_addr) -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_maskmove_si64(a, mask, mem_addr) simde_mm_maskmove_si64((a), (mask), SIMDE_CHECKED_REINTERPRET_CAST(int8_t*, char*, (mem_addr))) -# define _m_maskmovq(a, mask, mem_addr) simde_mm_maskmove_si64((a), (mask), SIMDE_CHECKED_REINTERPRET_CAST(int8_t*, char*, (mem_addr))) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_max_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_max_pi16(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vmax_s16(a_.neon_i16, b_.neon_i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? a_.i16[i] : b_.i16[i]; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pmaxsw(a, b) simde_mm_max_pi16(a, b) -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_max_pi16(a, b) simde_mm_max_pi16(a, b) -# define _m_pmaxsw(a, b) simde_mm_max_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_max_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_max_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_NANS) - r_.neon_f32 = vmaxq_f32(a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vbslq_f32(vcgtq_f32(a_.neon_f32, b_.neon_f32), a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) && defined(SIMDE_FAST_NANS) - r_.wasm_v128 = wasm_f32x4_max(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_bitselect(a_.wasm_v128, b_.wasm_v128, wasm_f32x4_gt(a_.wasm_v128, b_.wasm_v128)); - #elif (defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE)) && defined(SIMDE_FAST_NANS) - r_.altivec_f32 = vec_max(a_.altivec_f32, b_.altivec_f32); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) - r_.altivec_f32 = vec_sel(b_.altivec_f32, a_.altivec_f32, vec_cmpgt(a_.altivec_f32, b_.altivec_f32)); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && defined(SIMDE_FAST_NANS) - r_.lsx_f32 = __lsx_vfmax_s(a_.lsx_f32, b_.lsx_f32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = (a_.f32[i] > b_.f32[i]) ? a_.f32[i] : b_.f32[i]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_max_ps(a, b) simde_mm_max_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_max_pu8 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_max_pu8(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vmax_u8(a_.neon_u8, b_.neon_u8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = (a_.u8[i] > b_.u8[i]) ? 
a_.u8[i] : b_.u8[i]; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pmaxub(a, b) simde_mm_max_pu8(a, b) -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_max_pu8(a, b) simde_mm_max_pu8(a, b) -# define _m_pmaxub(a, b) simde_mm_max_pu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_max_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_max_ss(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_max_ps(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_max_ps(simde_x_mm_broadcastlow_ps(a), simde_x_mm_broadcastlow_ps(b))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - float32_t value = vgetq_lane_f32(maxq_f32(a_.neon_f32, b_.neon_f32), 0); - r_.neon_f32 = vsetq_lane_f32(value, a_.neon_f32, 0); - #else - r_.f32[0] = (a_.f32[0] > b_.f32[0]) ? a_.f32[0] : b_.f32[0]; - r_.f32[1] = a_.f32[1]; - r_.f32[2] = a_.f32[2]; - r_.f32[3] = a_.f32[3]; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_max_ss(a, b) simde_mm_max_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_min_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_min_pi16(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vmin_s16(a_.neon_i16, b_.neon_i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? a_.i16[i] : b_.i16[i]; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pminsw(a, b) simde_mm_min_pi16(a, b) -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_min_pi16(a, b) simde_mm_min_pi16(a, b) -# define _m_pminsw(a, b) simde_mm_min_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_min_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_min_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_FAST_NANS) && defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vminq_f32(a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_pmin(b_.wasm_v128, a_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) - #if defined(SIMDE_FAST_NANS) - r_.altivec_f32 = vec_min(a_.altivec_f32, b_.altivec_f32); - #else - r_.altivec_f32 = vec_sel(b_.altivec_f32, a_.altivec_f32, vec_cmpgt(b_.altivec_f32, a_.altivec_f32)); - #endif - #elif defined(SIMDE_FAST_NANS) && defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_f32 = __lsx_vfmin_s(a_.lsx_f32, b_.lsx_f32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - uint32_t SIMDE_VECTOR(16) m = HEDLEY_REINTERPRET_CAST(__typeof__(m), a_.f32 < b_.f32); - r_.f32 = - HEDLEY_REINTERPRET_CAST( - __typeof__(r_.f32), - ( (HEDLEY_REINTERPRET_CAST(__typeof__(m), a_.f32) & m) | - (HEDLEY_REINTERPRET_CAST(__typeof__(m), b_.f32) & ~m) - ) - ); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = (a_.f32[i] < b_.f32[i]) ? 
a_.f32[i] : b_.f32[i]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_min_ps(a, b) simde_mm_min_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_min_pu8 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_min_pu8(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vmin_u8(a_.neon_u8, b_.neon_u8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = (a_.u8[i] < b_.u8[i]) ? a_.u8[i] : b_.u8[i]; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pminub(a, b) simde_mm_min_pu8(a, b) -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_min_pu8(a, b) simde_mm_min_pu8(a, b) -# define _m_pminub(a, b) simde_mm_min_pu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_min_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_min_ss(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_min_ps(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_min_ps(simde_x_mm_broadcastlow_ps(a), simde_x_mm_broadcastlow_ps(b))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - float32_t value = vgetq_lane_f32(vminq_f32(a_.neon_f32, b_.neon_f32), 0); - r_.neon_f32 = vsetq_lane_f32(value, a_.neon_f32, 0); - #else - r_.f32[0] = (a_.f32[0] < b_.f32[0]) ? a_.f32[0] : b_.f32[0]; - r_.f32[1] = a_.f32[1]; - r_.f32[2] = a_.f32[2]; - r_.f32[3] = a_.f32[3]; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_min_ss(a, b) simde_mm_min_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_movehl_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_movehl_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u64 = vzip2q_u64(b_.neon_u64, a_.neon_u64); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - float32x2_t a32 = vget_high_f32(a_.neon_f32); - float32x2_t b32 = vget_high_f32(b_.neon_f32); - r_.neon_f32 = vcombine_f32(b32, a32); - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), - vec_mergel(b_.altivec_i64, a_.altivec_i64)); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vilvh_d(a_.lsx_i64, b_.lsx_i64); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 6, 7, 2, 3); - #else - r_.f32[0] = b_.f32[2]; - r_.f32[1] = b_.f32[3]; - r_.f32[2] = a_.f32[2]; - r_.f32[3] = a_.f32[3]; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_movehl_ps(a, b) simde_mm_movehl_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_movelh_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_movelh_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = 
simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - float32x2_t a10 = vget_low_f32(a_.neon_f32); - float32x2_t b10 = vget_low_f32(b_.neon_f32); - r_.neon_f32 = vcombine_f32(a10, b10); - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), - vec_mergeh(a_.altivec_i64, b_.altivec_i64)); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vilvl_d(b_.lsx_i64, a_.lsx_i64); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 0, 1, 4, 5); - #else - r_.f32[0] = a_.f32[0]; - r_.f32[1] = a_.f32[1]; - r_.f32[2] = b_.f32[0]; - r_.f32[3] = b_.f32[1]; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_movelh_ps(a, b) simde_mm_movelh_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_movemask_pi8 (simde__m64 a) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_movemask_pi8(a); - #else - simde__m64_private a_ = simde__m64_to_private(a); - int r = 0; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - uint8x8_t input = a_.neon_u8; - const int8_t xr[8] = {-7, -6, -5, -4, -3, -2, -1, 0}; - const uint8x8_t mask_and = vdup_n_u8(0x80); - const int8x8_t mask_shift = vld1_s8(xr); - const uint8x8_t mask_result = vshl_u8(vand_u8(input, mask_and), mask_shift); - uint8x8_t lo = mask_result; - r = vaddv_u8(lo); - #else - const size_t nmemb = sizeof(a_.i8) / sizeof(a_.i8[0]); - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < nmemb ; i++) { - r |= (a_.u8[nmemb - 1 - i] >> 7) << (nmemb - 1 - i); - } - #endif - - return r; - #endif -} -#define simde_m_pmovmskb(a) simde_mm_movemask_pi8(a) -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_movemask_pi8(a) simde_mm_movemask_pi8(a) -# define _m_pmovmskb(a) simde_mm_movemask_pi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_movemask_ps (simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_movemask_ps(a); - #else - int r = 0; - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - static const int32_t shift[4] = {0, 1, 2, 3}; - uint32x4_t tmp = vshrq_n_u32(a_.neon_u32, 31); - return HEDLEY_STATIC_CAST(int32_t, vaddvq_u32(vshlq_u32(tmp, vld1q_s32(shift)))); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - // Shift out everything but the sign bits with a 32-bit unsigned shift right. - uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(a_.neon_u32, 31)); - // Merge the two pairs together with a 64-bit unsigned shift right + add. - uint8x16_t paired = vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31)); - // Extract the result. 
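For readers skimming the removed fallbacks: `_mm_movemask_ps` packs the sign bit of each of the four float lanes into the low four bits of an `int`, which is exactly what the generic loop just below computes. A standalone scalar sketch of the same semantics, illustrative only:

```c
#include <stdint.h>
#include <string.h>

/* Collect the sign bit of float lane i into bit i of the result,
 * mirroring what _mm_movemask_ps returns for a 4-lane vector. */
static int movemask_ps_scalar(const float v[4]) {
  int mask = 0;
  for (int i = 0; i < 4; i++) {
    uint32_t bits;
    memcpy(&bits, &v[i], sizeof bits); /* type-pun via memcpy */
    mask |= (int)(bits >> 31) << i;
  }
  return mask;
}
```

For example, `{-1.0f, 2.0f, -0.0f, 4.0f}` yields `0b0101`, i.e. 5, because lanes 0 and 2 carry a set sign bit.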
- return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2); - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && defined(SIMDE_BUG_CLANG_50932) - SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) idx = { 96, 64, 32, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; - SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) res = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), vec_bperm(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned __int128), a_.altivec_u64), idx)); - return HEDLEY_STATIC_CAST(int32_t, vec_extract(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), res), 2)); - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) idx = { 96, 64, 32, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; - SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) res = vec_bperm(a_.altivec_u8, idx); - return HEDLEY_STATIC_CAST(int32_t, vec_extract(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), res), 2)); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - v2i64 t64 = __lsx_vmskltz_w(a_.lsx_i64); - r = __lsx_vpickve2gr_wu(t64, 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return HEDLEY_STATIC_CAST(int32_t, wasm_i32x4_bitmask(a_.wasm_v128)); - #else - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < sizeof(a_.u32) / sizeof(a_.u32[0]) ; i++) { - r |= (a_.u32[i] >> ((sizeof(a_.u32[i]) * CHAR_BIT) - 1)) << i; - } - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_movemask_ps(a) simde_mm_movemask_ps((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_mul_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_mul_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vmulq_f32(a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_mul(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f32 = a_.f32 * b_.f32; - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - r_.altivec_f32 = vec_mul(a_.altivec_f32, b_.altivec_f32); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_f32 = __lsx_vfmul_s(a_.lsx_f32, b_.lsx_f32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[i] * b_.f32[i]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_mul_ps(a, b) simde_mm_mul_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_mul_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_mul_ss(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_mul_ps(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_mul_ps(simde_x_mm_broadcastlow_ps(a), simde_x_mm_broadcastlow_ps(b))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - r_.f32[0] = a_.f32[0] * b_.f32[0]; - r_.f32[1] = a_.f32[1]; - r_.f32[2] = a_.f32[2]; - r_.f32[3] = a_.f32[3]; - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_mul_ss(a, b) simde_mm_mul_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_mulhi_pu16 (simde__m64 a, simde__m64 b) { - #if 
defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
-    return _mm_mulhi_pu16(a, b);
-  #else
-    simde__m64_private
-      r_,
-      a_ = simde__m64_to_private(a),
-      b_ = simde__m64_to_private(b);
-
-    #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
-      const uint32x4_t t1 = vmull_u16(a_.neon_u16, b_.neon_u16);
-      const uint32x4_t t2 = vshrq_n_u32(t1, 16);
-      const uint16x4_t t3 = vmovn_u32(t2);
-      r_.neon_u16 = t3;
-    #else
-      SIMDE_VECTORIZE
-      for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
-        r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, ((HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.u16[i])) >> UINT32_C(16)));
-      }
-    #endif
-
-    return simde__m64_from_private(r_);
-  #endif
-}
-#define simde_m_pmulhuw(a, b) simde_mm_mulhi_pu16(a, b)
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
-# define _mm_mulhi_pu16(a, b) simde_mm_mulhi_pu16(a, b)
-# define _m_pmulhuw(a, b) simde_mm_mulhi_pu16(a, b)
-#endif
-
-#if defined(SIMDE_X86_SSE_NATIVE) && defined(HEDLEY_GCC_VERSION)
-  #define SIMDE_MM_HINT_NTA HEDLEY_STATIC_CAST(enum _mm_hint, 0)
-  #define SIMDE_MM_HINT_T0 HEDLEY_STATIC_CAST(enum _mm_hint, 1)
-  #define SIMDE_MM_HINT_T1 HEDLEY_STATIC_CAST(enum _mm_hint, 2)
-  #define SIMDE_MM_HINT_T2 HEDLEY_STATIC_CAST(enum _mm_hint, 3)
-  #define SIMDE_MM_HINT_ENTA HEDLEY_STATIC_CAST(enum _mm_hint, 4)
-  #define SIMDE_MM_HINT_ET0 HEDLEY_STATIC_CAST(enum _mm_hint, 5)
-  #define SIMDE_MM_HINT_ET1 HEDLEY_STATIC_CAST(enum _mm_hint, 6)
-  #define SIMDE_MM_HINT_ET2 HEDLEY_STATIC_CAST(enum _mm_hint, 7)
-#else
-  #define SIMDE_MM_HINT_NTA 0
-  #define SIMDE_MM_HINT_T0 1
-  #define SIMDE_MM_HINT_T1 2
-  #define SIMDE_MM_HINT_T2 3
-  #define SIMDE_MM_HINT_ENTA 4
-  #define SIMDE_MM_HINT_ET0 5
-  #define SIMDE_MM_HINT_ET1 6
-  #define SIMDE_MM_HINT_ET2 7
-#endif
-
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
-  HEDLEY_DIAGNOSTIC_PUSH
-  #if HEDLEY_HAS_WARNING("-Wreserved-id-macro")
-    _Pragma("clang diagnostic ignored \"-Wreserved-id-macro\"")
-  #endif
-  #undef _MM_HINT_NTA
-  #define _MM_HINT_NTA SIMDE_MM_HINT_NTA
-  #undef _MM_HINT_T0
-  #define _MM_HINT_T0 SIMDE_MM_HINT_T0
-  #undef _MM_HINT_T1
-  #define _MM_HINT_T1 SIMDE_MM_HINT_T1
-  #undef _MM_HINT_T2
-  #define _MM_HINT_T2 SIMDE_MM_HINT_T2
-  #undef _MM_HINT_ENTA
-  #define _MM_HINT_ENTA SIMDE_MM_HINT_ENTA
-  #undef _MM_HINT_ET0
-  #define _MM_HINT_ET0 SIMDE_MM_HINT_ET0
-  #undef _MM_HINT_ET1
-  #define _MM_HINT_ET1 SIMDE_MM_HINT_ET1
-  #undef _MM_HINT_ET2
-  #define _MM_HINT_ET2 SIMDE_MM_HINT_ET2
-  HEDLEY_DIAGNOSTIC_POP
-#endif
-
-SIMDE_FUNCTION_ATTRIBUTES
-void
-simde_mm_prefetch (const void* p, int i) {
-  #if \
-    HEDLEY_HAS_BUILTIN(__builtin_prefetch) || \
-    HEDLEY_GCC_VERSION_CHECK(3,4,0) || \
-    HEDLEY_INTEL_VERSION_CHECK(13,0,0)
-    switch(i) {
-      case SIMDE_MM_HINT_NTA:
-        __builtin_prefetch(p, 0, 0);
-        break;
-      case SIMDE_MM_HINT_T0:
-        __builtin_prefetch(p, 0, 3);
-        break;
-      case SIMDE_MM_HINT_T1:
-        __builtin_prefetch(p, 0, 2);
-        break;
-      case SIMDE_MM_HINT_T2:
-        __builtin_prefetch(p, 0, 1);
-        break;
-      case SIMDE_MM_HINT_ENTA:
-        __builtin_prefetch(p, 1, 0);
-        break;
-      case SIMDE_MM_HINT_ET0:
-        __builtin_prefetch(p, 1, 3);
-        break;
-      case SIMDE_MM_HINT_ET1:
-        __builtin_prefetch(p, 1, 2);
-        break;
-      case SIMDE_MM_HINT_ET2:
-        __builtin_prefetch(p, 1, 1);
-        break;
-    }
-  #elif defined(__ARM_ACLE)
-    #if (__ARM_ACLE >= 101)
-      switch(i) {
-        case SIMDE_MM_HINT_NTA:
-          __pldx(0, 0, 1, p);
-          break;
-        case SIMDE_MM_HINT_T0:
-          __pldx(0, 0, 0, p);
-          break;
-        case SIMDE_MM_HINT_T1:
-          __pldx(0, 1, 0, p);
-          break;
-        case SIMDE_MM_HINT_T2:
-          __pldx(0, 2, 0, p);
-          break;
-
case SIMDE_MM_HINT_ENTA: - __pldx(1, 0, 1, p); - break; - case SIMDE_MM_HINT_ET0: - __pldx(1, 0, 0, p); - break; - case SIMDE_MM_HINT_ET1: - __pldx(1, 1, 0, p); - break; - case SIMDE_MM_HINT_ET2: - __pldx(1, 2, 0, p); - break; - } - #else - (void) i; - __pld(p) - #endif - #elif HEDLEY_PGI_VERSION_CHECK(10,0,0) - (void) i; - #pragma mem prefetch p - #elif HEDLEY_CRAY_VERSION_CHECK(8,1,0) - switch (i) { - case SIMDE_MM_HINT_NTA: - #pragma _CRI prefetch (nt) p - break; - case SIMDE_MM_HINT_T0: - case SIMDE_MM_HINT_T1: - case SIMDE_MM_HINT_T2: - #pragma _CRI prefetch p - break; - case SIMDE_MM_HINT_ENTA: - #pragma _CRI prefetch (write, nt) p - break; - case SIMDE_MM_HINT_ET0: - case SIMDE_MM_HINT_ET1: - case SIMDE_MM_HINT_ET2: - #pragma _CRI prefetch (write) p - break; - } - #elif HEDLEY_IBM_VERSION_CHECK(11,0,0) - switch(i) { - case SIMDE_MM_HINT_NTA: - __prefetch_by_load(p, 0, 0); - break; - case SIMDE_MM_HINT_T0: - __prefetch_by_load(p, 0, 3); - break; - case SIMDE_MM_HINT_T1: - __prefetch_by_load(p, 0, 2); - break; - case SIMDE_MM_HINT_T2: - __prefetch_by_load(p, 0, 1); - break; - case SIMDE_MM_HINT_ENTA: - __prefetch_by_load(p, 1, 0); - break; - case SIMDE_MM_HINT_ET0: - __prefetch_by_load(p, 1, 3); - break; - case SIMDE_MM_HINT_ET1: - __prefetch_by_load(p, 1, 2); - break; - case SIMDE_MM_HINT_ET2: - __prefetch_by_load(p, 0, 1); - break; - } - #elif HEDLEY_MSVC_VERSION - (void) i; - (void) p; - #endif -} -#if defined(SIMDE_X86_SSE_NATIVE) - #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(10,0,0) /* https://reviews.llvm.org/D71718 */ - #define simde_mm_prefetch(p, i) \ - (__extension__({ \ - HEDLEY_DIAGNOSTIC_PUSH \ - HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL \ - _mm_prefetch((p), (i)); \ - HEDLEY_DIAGNOSTIC_POP \ - })) - #else - #define simde_mm_prefetch(p, i) _mm_prefetch(p, i) - #endif -#endif -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) - #define _mm_prefetch(p, i) simde_mm_prefetch(p, i) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_x_mm_negate_ps(simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - return simde_mm_xor_ps(a, _mm_set1_ps(SIMDE_FLOAT32_C(-0.0))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vnegq_f32(a_.neon_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_neg(a_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - r_.altivec_f32 = vec_neg(a_.altivec_f32); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - const v4f32 f32 = {0.0f, 0.0f, 0.0f, 0.0f}; - r_.lsx_f32 = __lsx_vfsub_s(f32, a_.lsx_f32); - #elif defined(SIMDE_VECTOR_NEGATE) - r_.f32 = -a_.f32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = -a_.f32[i]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_rcp_ps (simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_rcp_ps(a); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - float32x4_t recip = vrecpeq_f32(a_.neon_f32); - - #if SIMDE_ACCURACY_PREFERENCE > 0 - for (int i = 0; i < SIMDE_ACCURACY_PREFERENCE ; ++i) { - recip = vmulq_f32(recip, vrecpsq_f32(recip, a_.neon_f32)); - } - #endif - - r_.neon_f32 = recip; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_div(simde_mm_set1_ps(1.0f), a_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_f32 = vec_re(a_.altivec_f32); - #elif 
defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_f32 = __lsx_vfrecip_s(a_.lsx_f32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.f32 = 1.0f / a_.f32; - #elif defined(SIMDE_IEEE754_STORAGE) - /* https://stackoverflow.com/questions/12227126/division-as-multiply-and-lut-fast-float-division-reciprocal/12228234#12228234 */ - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - int32_t ix; - simde_float32 fx = a_.f32[i]; - simde_memcpy(&ix, &fx, sizeof(ix)); - int32_t x = INT32_C(0x7EF311C3) - ix; - simde_float32 temp; - simde_memcpy(&temp, &x, sizeof(temp)); - r_.f32[i] = temp * (SIMDE_FLOAT32_C(2.0) - temp * fx); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = 1.0f / a_.f32[i]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_rcp_ps(a) simde_mm_rcp_ps((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_rcp_ss (simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_rcp_ss(a); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_rcp_ps(a)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_rcp_ps(simde_x_mm_broadcastlow_ps(a))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - r_.f32[0] = 1.0f / a_.f32[0]; - r_.f32[1] = a_.f32[1]; - r_.f32[2] = a_.f32[2]; - r_.f32[3] = a_.f32[3]; - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_rcp_ss(a) simde_mm_rcp_ss((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_rsqrt_ps (simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_rsqrt_ps(a); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vrsqrteq_f32(a_.neon_f32); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_f32 = vec_rsqrte(a_.altivec_f32); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_f32 = __lsx_vfrsqrt_s(a_.lsx_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_div(simde_mm_set1_ps(1.0f), wasm_f32x4_sqrt(a_.wasm_v128)); - #elif defined(SIMDE_IEEE754_STORAGE) - /* https://basesandframes.files.wordpress.com/2020/04/even_faster_math_functions_green_2020.pdf - Pages 100 - 103 */ - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - #if SIMDE_ACCURACY_PREFERENCE <= 0 - r_.i32[i] = INT32_C(0x5F37624F) - (a_.i32[i] >> 1); - #else - simde_float32 x = a_.f32[i]; - simde_float32 xhalf = SIMDE_FLOAT32_C(0.5) * x; - int32_t ix; - - simde_memcpy(&ix, &x, sizeof(ix)); - - #if SIMDE_ACCURACY_PREFERENCE == 1 - ix = INT32_C(0x5F375A82) - (ix >> 1); - #else - ix = INT32_C(0x5F37599E) - (ix >> 1); - #endif - - simde_memcpy(&x, &ix, sizeof(x)); - - #if SIMDE_ACCURACY_PREFERENCE >= 2 - x = x * (SIMDE_FLOAT32_C(1.5008909) - xhalf * x * x); - #endif - x = x * (SIMDE_FLOAT32_C(1.5008909) - xhalf * x * x); - - r_.f32[i] = x; - #endif - } - #elif defined(simde_math_sqrtf) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = 1.0f / simde_math_sqrtf(a_.f32[i]); - } - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_rsqrt_ps(a) simde_mm_rsqrt_ps((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 
-simde_mm_rsqrt_ss (simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_rsqrt_ss(a); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_rsqrt_ps(a)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_rsqrt_ps(simde_x_mm_broadcastlow_ps(a))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vsetq_lane_f32(vgetq_lane_f32(simde_mm_rsqrt_ps(a).neon_f32, 0), a_.neon_f32, 0); - #elif defined(SIMDE_IEEE754_STORAGE) - { - #if SIMDE_ACCURACY_PREFERENCE <= 0 - r_.i32[0] = INT32_C(0x5F37624F) - (a_.i32[0] >> 1); - #else - simde_float32 x = a_.f32[0]; - simde_float32 xhalf = SIMDE_FLOAT32_C(0.5) * x; - int32_t ix; - - simde_memcpy(&ix, &x, sizeof(ix)); - - #if SIMDE_ACCURACY_PREFERENCE == 1 - ix = INT32_C(0x5F375A82) - (ix >> 1); - #else - ix = INT32_C(0x5F37599E) - (ix >> 1); - #endif - - simde_memcpy(&x, &ix, sizeof(x)); - - #if SIMDE_ACCURACY_PREFERENCE >= 2 - x = x * (SIMDE_FLOAT32_C(1.5008909) - xhalf * x * x); - #endif - x = x * (SIMDE_FLOAT32_C(1.5008909) - xhalf * x * x); - - r_.f32[0] = x; - #endif - } - r_.f32[1] = a_.f32[1]; - r_.f32[2] = a_.f32[2]; - r_.f32[3] = a_.f32[3]; - #elif defined(simde_math_sqrtf) - r_.f32[0] = 1.0f / simde_math_sqrtf(a_.f32[0]); - r_.f32[1] = a_.f32[1]; - r_.f32[2] = a_.f32[2]; - r_.f32[3] = a_.f32[3]; - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_rsqrt_ss(a) simde_mm_rsqrt_ss((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_sad_pu8 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_sad_pu8(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint64x1_t t = vpaddl_u32(vpaddl_u16(vpaddl_u8(vabd_u8(a_.neon_u8, b_.neon_u8)))); - r_.neon_u16 = vset_lane_u16(HEDLEY_STATIC_CAST(uint64_t, vget_lane_u64(t, 0)), vdup_n_u16(0), 0); - #else - uint16_t sum = 0; - - SIMDE_VECTORIZE_REDUCTION(+:sum) - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - sum += HEDLEY_STATIC_CAST(uint8_t, simde_math_abs(a_.u8[i] - b_.u8[i])); - } - - r_.i16[0] = HEDLEY_STATIC_CAST(int16_t, sum); - r_.i16[1] = 0; - r_.i16[2] = 0; - r_.i16[3] = 0; - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psadbw(a, b) simde_mm_sad_pu8(a, b) -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_sad_pu8(a, b) simde_mm_sad_pu8(a, b) -# define _m_psadbw(a, b) simde_mm_sad_pu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_set_ss (simde_float32 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_set_ss(a); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - return vsetq_lane_f32(a, vdupq_n_f32(SIMDE_FLOAT32_C(0.0)), 0); - #else - return simde_mm_set_ps(SIMDE_FLOAT32_C(0.0), SIMDE_FLOAT32_C(0.0), SIMDE_FLOAT32_C(0.0), a); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_set_ss(a) simde_mm_set_ss(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_setr_ps (simde_float32 e3, simde_float32 e2, simde_float32 e1, simde_float32 e0) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_setr_ps(e3, e2, e1, e0); - #else - return simde_mm_set_ps(e0, e1, e2, e3); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_setr_ps(e3, 
e2, e1, e0) simde_mm_setr_ps(e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_setzero_ps (void) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_setzero_ps(); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - return vdupq_n_f32(SIMDE_FLOAT32_C(0.0)); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - return vec_splats(SIMDE_FLOAT32_C(0.0)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f32x4_const(0.f, 0.f, 0.f, 0.f); - #else - simde__m128 r; - simde_memset(&r, 0, sizeof(r)); - return r; - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_setzero_ps() simde_mm_setzero_ps() -#endif - -#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_undefined_ps (void) { - simde__m128_private r_; - - #if defined(SIMDE_HAVE_UNDEFINED128) - r_.n = _mm_undefined_ps(); - #elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) - r_ = simde__m128_to_private(simde_mm_setzero_ps()); - #endif - - return simde__m128_from_private(r_); -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_undefined_ps() simde_mm_undefined_ps() -#endif - -#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) -HEDLEY_DIAGNOSTIC_POP -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_x_mm_setone_ps (void) { - simde__m128 t = simde_mm_setzero_ps(); - return simde_mm_cmpeq_ps(t, t); -} - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_sfence (void) { - /* TODO: Use Hedley. */ - #if defined(SIMDE_X86_SSE_NATIVE) - _mm_sfence(); - #elif defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)) - __atomic_thread_fence(__ATOMIC_SEQ_CST); - #elif !defined(__INTEL_COMPILER) && defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__) - #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ < 9) - __atomic_thread_fence(__ATOMIC_SEQ_CST); - #else - atomic_thread_fence(memory_order_seq_cst); - #endif - #elif defined(_MSC_VER) - MemoryBarrier(); - #elif HEDLEY_HAS_EXTENSION(c_atomic) - __c11_atomic_thread_fence(__ATOMIC_SEQ_CST); - #elif defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)) - __sync_synchronize(); - #elif defined(_OPENMP) - #pragma omp critical(simde_mm_sfence_) - { } - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_sfence() simde_mm_sfence() -#endif - -#define SIMDE_MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _MM_SHUFFLE(z, y, x, w) SIMDE_MM_SHUFFLE(z, y, x, w) -#endif - -#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI) -# define simde_mm_shuffle_pi16(a, imm8) _mm_shuffle_pi16(a, imm8) -#elif defined(SIMDE_SHUFFLE_VECTOR_) -# define simde_mm_shuffle_pi16(a, imm8) (__extension__ ({ \ - const simde__m64_private simde_tmp_a_ = simde__m64_to_private(a); \ - simde__m64_from_private((simde__m64_private) { .i16 = \ - SIMDE_SHUFFLE_VECTOR_(16, 8, \ - (simde_tmp_a_).i16, \ - (simde_tmp_a_).i16, \ - (((imm8) ) & 3), \ - (((imm8) >> 2) & 3), \ - (((imm8) >> 4) & 3), \ - (((imm8) >> 6) & 3)) }); })) -#else -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_shuffle_pi16 (simde__m64 a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - - for (size_t i = 0 ; i < sizeof(r_.i16) / sizeof(r_.i16[0]) ; i++) { - r_.i16[i] = a_.i16[(imm8 
>> (i * 2)) & 3]; - } - -HEDLEY_DIAGNOSTIC_PUSH -#if HEDLEY_HAS_WARNING("-Wconditional-uninitialized") -# pragma clang diagnostic ignored "-Wconditional-uninitialized" -#endif - return simde__m64_from_private(r_); -HEDLEY_DIAGNOSTIC_POP -} -#endif -#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI) -# define simde_m_pshufw(a, imm8) _m_pshufw(a, imm8) -#else -# define simde_m_pshufw(a, imm8) simde_mm_shuffle_pi16(a, imm8) -#endif -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_shuffle_pi16(a, imm8) simde_mm_shuffle_pi16(a, imm8) -# define _m_pshufw(a, imm8) simde_mm_shuffle_pi16(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_shuffle_ps (simde__m128 a, simde__m128 b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - r_.f32[0] = a_.f32[(imm8 >> 0) & 3]; - r_.f32[1] = a_.f32[(imm8 >> 2) & 3]; - r_.f32[2] = b_.f32[(imm8 >> 4) & 3]; - r_.f32[3] = b_.f32[(imm8 >> 6) & 3]; - - return simde__m128_from_private(r_); -} -#if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI) -# define simde_mm_shuffle_ps(a, b, imm8) _mm_shuffle_ps(a, b, imm8) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - #define simde_mm_shuffle_ps(a, b, imm8) (__extension__ ({ \ - simde__m128_from_private((simde__m128_private) { .wasm_v128 = \ - wasm_i32x4_shuffle( \ - simde__m128_to_private(a).wasm_v128, \ - simde__m128_to_private(b).wasm_v128, \ - (((imm8) ) & 3), \ - (((imm8) >> 2) & 3), \ - (((imm8) >> 4) & 3) + 4, \ - (((imm8) >> 6) & 3) + 4) }); })) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm_shuffle_ps(a, b, imm8) \ - (__extension__({ \ - float32x4_t simde_mm_shuffle_ps_a_ = simde__m128_to_neon_f32(a); \ - float32x4_t simde_mm_shuffle_ps_b_ = simde__m128_to_neon_f32(b); \ - float32x4_t simde_mm_shuffle_ps_r_; \ - \ - simde_mm_shuffle_ps_r_ = vmovq_n_f32(vgetq_lane_f32(simde_mm_shuffle_ps_a_, (imm8) & (0x3))); \ - simde_mm_shuffle_ps_r_ = vsetq_lane_f32(vgetq_lane_f32(simde_mm_shuffle_ps_a_, ((imm8) >> 2) & 0x3), simde_mm_shuffle_ps_r_, 1); \ - simde_mm_shuffle_ps_r_ = vsetq_lane_f32(vgetq_lane_f32(simde_mm_shuffle_ps_b_, ((imm8) >> 4) & 0x3), simde_mm_shuffle_ps_r_, 2); \ - vsetq_lane_f32(vgetq_lane_f32(simde_mm_shuffle_ps_b_, ((imm8) >> 6) & 0x3), simde_mm_shuffle_ps_r_, 3); \ - })) -#elif defined(SIMDE_SHUFFLE_VECTOR_) - #define simde_mm_shuffle_ps(a, b, imm8) (__extension__ ({ \ - simde__m128_from_private((simde__m128_private) { .f32 = \ - SIMDE_SHUFFLE_VECTOR_(32, 16, \ - simde__m128_to_private(a).f32, \ - simde__m128_to_private(b).f32, \ - (((imm8) ) & 3), \ - (((imm8) >> 2) & 3), \ - (((imm8) >> 4) & 3) + 4, \ - (((imm8) >> 6) & 3) + 4) }); })) -#endif -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_shuffle_ps(a, b, imm8) simde_mm_shuffle_ps((a), (b), imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_sqrt_ps (simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_sqrt_ps(a); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f32 = vsqrtq_f32(a_.neon_f32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - float32x4_t est = vrsqrteq_f32(a_.neon_f32); - for (int i = 0 ; i <= SIMDE_ACCURACY_PREFERENCE ; i++) { - est = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a_.neon_f32, est), est), est); - } - r_.neon_f32 = vmulq_f32(a_.neon_f32, est); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - 
r_.wasm_v128 = wasm_f32x4_sqrt(a_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) - r_.altivec_f32 = vec_sqrt(a_.altivec_f32); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_f32 = __lsx_vfsqrt_s(a_.lsx_f32); - #elif defined(simde_math_sqrt) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < sizeof(r_.f32) / sizeof(r_.f32[0]) ; i++) { - r_.f32[i] = simde_math_sqrtf(a_.f32[i]); - } - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_sqrt_ps(a) simde_mm_sqrt_ps((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_sqrt_ss (simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_sqrt_ss(a); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_sqrt_ps(a)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_sqrt_ps(simde_x_mm_broadcastlow_ps(a))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - float32_t value = - vgetq_lane_f32(simde__m128_to_private(simde_mm_sqrt_ps(a)).neon_f32, 0); - r_.neon_f32 = vsetq_lane_f32(value, a_.neon_f32, 0); - #elif defined(simde_math_sqrtf) - r_.f32[0] = simde_math_sqrtf(a_.f32[0]); - r_.f32[1] = a_.f32[1]; - r_.f32[2] = a_.f32[2]; - r_.f32[3] = a_.f32[3]; - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_sqrt_ss(a) simde_mm_sqrt_ss((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_store_ps (simde_float32 mem_addr[4], simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - _mm_store_ps(mem_addr, a); - #else - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - vst1q_f32(mem_addr, a_.neon_f32); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - vec_st(a_.altivec_f32, 0, mem_addr); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - wasm_v128_store(mem_addr, a_.wasm_v128); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - __lsx_vst(a_.lsx_f32, mem_addr, 0); - #else - simde_memcpy(mem_addr, &a_, sizeof(a)); - #endif - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_store_ps(mem_addr, a) simde_mm_store_ps(SIMDE_CHECKED_REINTERPRET_CAST(float*, simde_float32*, mem_addr), (a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_store1_ps (simde_float32 mem_addr[4], simde__m128 a) { - simde_float32* mem_addr_ = SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128); - - #if defined(SIMDE_X86_SSE_NATIVE) - _mm_store_ps1(mem_addr_, a); - #else - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - vst1q_f32(mem_addr_, vdupq_lane_f32(vget_low_f32(a_.neon_f32), 0)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - wasm_v128_store(mem_addr_, wasm_i32x4_shuffle(a_.wasm_v128, a_.wasm_v128, 0, 0, 0, 0)); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - vec_st(vec_splat(a_.altivec_f32, 0), 0, mem_addr_); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - __lsx_vst(__lsx_vreplvei_w(a_.lsx_f32, 0), mem_addr_, 0); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - simde__m128_private tmp_; - tmp_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 0, 0, 0, 0); - simde_mm_store_ps(mem_addr_, tmp_.f32); - #else - SIMDE_VECTORIZE_ALIGNED(mem_addr_:16) - for (size_t i = 0 ; i < sizeof(a_.f32) / sizeof(a_.f32[0]) ; i++) { - mem_addr_[i] = a_.f32[0]; - } - #endif - 
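A note on the `_ss` fallbacks above (`simde_mm_sqrt_ss` and friends): each applies its operation to lane 0 only and copies lanes 1 through 3 from `a` unchanged. A small usage sketch of that contract; the function name is mine and the values are arbitrary:

```c
#include <assert.h>
#include <simde/x86/sse.h>

static void check_sqrt_ss_contract(void) {
  /* _mm_set_ps takes lanes in (e3, e2, e1, e0) order, so lane 0 is 9.0f. */
  simde__m128 a = simde_mm_set_ps(8.0f, 7.0f, 6.0f, 9.0f);
  simde_float32 out[4];
  simde_mm_storeu_ps(out, simde_mm_sqrt_ss(a));
  assert(out[0] == 3.0f);                                     /* sqrt on lane 0 only */
  assert(out[1] == 6.0f && out[2] == 7.0f && out[3] == 8.0f); /* lanes 1..3 pass through */
}
```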
#endif -} -#define simde_mm_store_ps1(mem_addr, a) simde_mm_store1_ps(mem_addr, a) -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_store_ps1(mem_addr, a) simde_mm_store1_ps(SIMDE_CHECKED_REINTERPRET_CAST(float*, simde_float32*, mem_addr), (a)) -# define _mm_store1_ps(mem_addr, a) simde_mm_store1_ps(SIMDE_CHECKED_REINTERPRET_CAST(float*, simde_float32*, mem_addr), (a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_store_ss (simde_float32* mem_addr, simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - _mm_store_ss(mem_addr, a); - #else - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - vst1q_lane_f32(mem_addr, a_.neon_f32, 0); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - __lsx_vstelm_w(a_.lsx_f32, mem_addr, 0, 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - wasm_v128_store32_lane(HEDLEY_REINTERPRET_CAST(void*, mem_addr), a_.wasm_v128, 0); - #else - *mem_addr = a_.f32[0]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_store_ss(mem_addr, a) simde_mm_store_ss(SIMDE_CHECKED_REINTERPRET_CAST(float*, simde_float32*, mem_addr), (a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_storeh_pi (simde__m64* mem_addr, simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - _mm_storeh_pi(HEDLEY_REINTERPRET_CAST(__m64*, mem_addr), a); - #else - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - vst1_f32(HEDLEY_REINTERPRET_CAST(float32_t*, mem_addr), vget_high_f32(a_.neon_f32)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - wasm_v128_store64_lane(HEDLEY_REINTERPRET_CAST(void*, mem_addr), a_.wasm_v128, 1); - #else - simde_memcpy(mem_addr, &(a_.m64[1]), sizeof(a_.m64[1])); - #endif - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_storeh_pi(mem_addr, a) simde_mm_storeh_pi(mem_addr, (a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_storel_pi (simde__m64* mem_addr, simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - _mm_storel_pi(HEDLEY_REINTERPRET_CAST(__m64*, mem_addr), a); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - simde__m128_private a_ = simde__m128_to_private(a); - wasm_v128_store64_lane(HEDLEY_REINTERPRET_CAST(void*, mem_addr), a_.wasm_v128, 0); - #else - simde__m64_private* dest_ = HEDLEY_REINTERPRET_CAST(simde__m64_private*, mem_addr); - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - dest_->neon_f32 = vget_low_f32(a_.neon_f32); - #else - dest_->f32[0] = a_.f32[0]; - dest_->f32[1] = a_.f32[1]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_storel_pi(mem_addr, a) simde_mm_storel_pi(mem_addr, (a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_storer_ps (simde_float32 mem_addr[4], simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - _mm_storer_ps(mem_addr, a); - #else - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - vec_st(vec_reve(a_.altivec_f32), 0, mem_addr); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - float32x4_t tmp = vrev64q_f32(a_.neon_f32); - vst1q_f32(mem_addr, vextq_f32(tmp, tmp, 2)); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - __lsx_vst(__lsx_vshuf4i_w(a_.lsx_f32, 0x1b), mem_addr, 0); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - a_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 3, 2, 1, 0); - simde_mm_store_ps(mem_addr, simde__m128_from_private(a_)); - #else - SIMDE_VECTORIZE_ALIGNED(mem_addr:16) - for (size_t i = 0 ; i < sizeof(a_.f32) / 
sizeof(a_.f32[0]) ; i++) { - mem_addr[i] = a_.f32[((sizeof(a_.f32) / sizeof(a_.f32[0])) - 1) - i]; - } - #endif - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_storer_ps(mem_addr, a) simde_mm_storer_ps(SIMDE_CHECKED_REINTERPRET_CAST(float*, simde_float32*, mem_addr), (a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_storeu_ps (simde_float32 mem_addr[4], simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - _mm_storeu_ps(mem_addr, a); - #else - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - vst1q_f32(mem_addr, a_.neon_f32); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - vec_vsx_st(a_.altivec_f32, 0, mem_addr); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - __lsx_vst(a_.lsx_f32, mem_addr, 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - wasm_v128_store(mem_addr, a_.wasm_v128); - #else - simde_memcpy(mem_addr, &a_, sizeof(a_)); - #endif - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_storeu_ps(mem_addr, a) simde_mm_storeu_ps(SIMDE_CHECKED_REINTERPRET_CAST(float*, simde_float32*, mem_addr), (a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_sub_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_sub_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vsubq_f32(a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_sub(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_f32 = vec_sub(a_.altivec_f32, b_.altivec_f32); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_f32 = __lsx_vfsub_s(a_.lsx_f32, b_.lsx_f32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f32 = a_.f32 - b_.f32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[i] - b_.f32[i]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_sub_ps(a, b) simde_mm_sub_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_sub_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_sub_ss(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_sub_ps(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_sub_ps(simde_x_mm_broadcastlow_ps(a), simde_x_mm_broadcastlow_ps(b))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - r_.f32[0] = a_.f32[0] - b_.f32[0]; - r_.f32[1] = a_.f32[1]; - r_.f32[2] = a_.f32[2]; - r_.f32[3] = a_.f32[3]; - - return simde__m128_from_private(r_); - #endif -} - -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_sub_ss(a, b) simde_mm_sub_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_ucomieq_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_ucomieq_ss(a, b); - #else - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - int r; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32); - uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32); - uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); - uint32x4_t a_eq_b = vceqq_f32(a_.neon_f32, 
b_.neon_f32); - r = !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_eq_b), 0) != 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r = wasm_f32x4_extract_lane(a_.wasm_v128, 0) == wasm_f32x4_extract_lane(b_.wasm_v128, 0); - #elif defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = a_.f32[0] == b_.f32[0]; - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = a_.f32[0] == b_.f32[0]; - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_ucomieq_ss(a, b) simde_mm_ucomieq_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_ucomige_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_ucomige_ss(a, b); - #else - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - int r; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32); - uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32); - uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_ge_b = vcgeq_f32(a_.neon_f32, b_.neon_f32); - r = !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r = wasm_f32x4_extract_lane(a_.wasm_v128, 0) >= wasm_f32x4_extract_lane(b_.wasm_v128, 0); - #elif defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = a_.f32[0] >= b_.f32[0]; - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = a_.f32[0] >= b_.f32[0]; - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_ucomige_ss(a, b) simde_mm_ucomige_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_ucomigt_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_ucomigt_ss(a, b); - #else - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - int r; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32); - uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32); - uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_gt_b = vcgtq_f32(a_.neon_f32, b_.neon_f32); - r = !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r = wasm_f32x4_extract_lane(a_.wasm_v128, 0) > wasm_f32x4_extract_lane(b_.wasm_v128, 0); - #elif defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = a_.f32[0] > b_.f32[0]; - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = a_.f32[0] > b_.f32[0]; - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_ucomigt_ss(a, b) simde_mm_ucomigt_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_ucomile_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_ucomile_ss(a, b); - #else - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - int r; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32); - uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32); - uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); - uint32x4_t a_le_b = vcleq_f32(a_.neon_f32, b_.neon_f32); - r = !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_le_b), 0) != 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r = wasm_f32x4_extract_lane(a_.wasm_v128, 0) <= wasm_f32x4_extract_lane(b_.wasm_v128, 0); - #elif 
defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = a_.f32[0] <= b_.f32[0]; - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = a_.f32[0] <= b_.f32[0]; - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_ucomile_ss(a, b) simde_mm_ucomile_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_ucomilt_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_ucomilt_ss(a, b); - #else - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - int r; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32); - uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32); - uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); - uint32x4_t a_lt_b = vcltq_f32(a_.neon_f32, b_.neon_f32); - r = !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_lt_b), 0) != 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r = wasm_f32x4_extract_lane(a_.wasm_v128, 0) < wasm_f32x4_extract_lane(b_.wasm_v128, 0); - #elif defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = a_.f32[0] < b_.f32[0]; - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = a_.f32[0] < b_.f32[0]; - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_ucomilt_ss(a, b) simde_mm_ucomilt_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_ucomineq_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_ucomineq_ss(a, b); - #else - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - int r; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32); - uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32); - uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_neq_b = vmvnq_u32(vceqq_f32(a_.neon_f32, b_.neon_f32)); - r = !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_neq_b), 0) != 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r = wasm_f32x4_extract_lane(a_.wasm_v128, 0) != wasm_f32x4_extract_lane(b_.wasm_v128, 0); - #elif defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = a_.f32[0] != b_.f32[0]; - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = a_.f32[0] != b_.f32[0]; - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_ucomineq_ss(a, b) simde_mm_ucomineq_ss((a), (b)) -#endif - -#if defined(SIMDE_X86_SSE_NATIVE) -# if defined(__has_builtin) -# if __has_builtin(__builtin_ia32_undef128) -# define SIMDE_HAVE_UNDEFINED128 -# endif -# elif !defined(__PGI) && !defined(SIMDE_BUG_GCC_REV_208793) && !defined(_MSC_VER) -# define SIMDE_HAVE_UNDEFINED128 -# endif -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_unpackhi_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_unpackhi_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f32 = vzip2q_f32(a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - float32x2_t a1 = vget_high_f32(a_.neon_f32); - float32x2_t b1 = vget_high_f32(b_.neon_f32); - float32x2x2_t result = vzip_f32(a1, b1); - r_.neon_f32 = vcombine_f32(result.val[0], result.val[1]); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = 
__lsx_vilvh_w(b_.lsx_i64, a_.lsx_i64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 2, 6, 3, 7); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 2, 6, 3, 7); - #else - r_.f32[0] = a_.f32[2]; - r_.f32[1] = b_.f32[2]; - r_.f32[2] = a_.f32[3]; - r_.f32[3] = b_.f32[3]; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_unpackhi_ps(a, b) simde_mm_unpackhi_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_unpacklo_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_unpacklo_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f32 = vzip1q_f32(a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_f32 = vec_mergeh(a_.altivec_f32, b_.altivec_f32); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vilvl_w(b_.lsx_i64, a_.lsx_i64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 4, 1, 5); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - float32x2_t a1 = vget_low_f32(a_.neon_f32); - float32x2_t b1 = vget_low_f32(b_.neon_f32); - float32x2x2_t result = vzip_f32(a1, b1); - r_.neon_f32 = vcombine_f32(result.val[0], result.val[1]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 0, 4, 1, 5); - #else - r_.f32[0] = a_.f32[0]; - r_.f32[1] = b_.f32[0]; - r_.f32[2] = a_.f32[1]; - r_.f32[3] = b_.f32[1]; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_unpacklo_ps(a, b) simde_mm_unpacklo_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_stream_pi (simde__m64* mem_addr, simde__m64 a) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - _mm_stream_pi(HEDLEY_REINTERPRET_CAST(__m64*, mem_addr), a); - #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && ( \ - defined(SIMDE_ARM_NEON_A32V7_NATIVE) || defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) || \ - defined(SIMDE_VECTOR_SUBSCRIPT)) - __builtin_nontemporal_store(a, mem_addr); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - simde__m64_private a_ = simde__m64_to_private(a); - vst1_s64(HEDLEY_REINTERPRET_CAST(int64_t *, mem_addr), a_.neon_i64); - #else - simde__m64_private* - dest = HEDLEY_REINTERPRET_CAST(simde__m64_private*, mem_addr), - a_ = simde__m64_to_private(a); - - dest->i64[0] = a_.i64[0]; - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_stream_pi(mem_addr, a) simde_mm_stream_pi(mem_addr, (a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_stream_ps (simde_float32 mem_addr[4], simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - _mm_stream_ps(mem_addr, a); - #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && ( \ - defined(SIMDE_ARM_NEON_A32V7_NATIVE) || defined(SIMDE_VECTOR_SUBSCRIPT) || \ - defined(SIMDE_WASM_SIMD128_NATIVE) || defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || \ - defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) || defined(SIMDE_LOONGARCH_LSX_NATIVE)) - __builtin_nontemporal_store(a, SIMDE_ALIGN_ASSUME_CAST(__typeof__(a)*, mem_addr)); - #else - simde_mm_store_ps(mem_addr, a); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_stream_ps(mem_addr, a) 
simde_mm_stream_ps(SIMDE_CHECKED_REINTERPRET_CAST(float*, simde_float32*, mem_addr), (a))
-#endif
-
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_ARM_NEON_A64V8_NATIVE)
-  #define SIMDE_MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
-    do { \
-      float32x4x2_t SIMDE_MM_TRANSPOSE4_PS_ROW01 = vtrnq_f32(row0, row1); \
-      float32x4x2_t SIMDE_MM_TRANSPOSE4_PS_ROW23 = vtrnq_f32(row2, row3); \
-      row0 = vcombine_f32(vget_low_f32(SIMDE_MM_TRANSPOSE4_PS_ROW01.val[0]), \
-                          vget_low_f32(SIMDE_MM_TRANSPOSE4_PS_ROW23.val[0])); \
-      row1 = vcombine_f32(vget_low_f32(SIMDE_MM_TRANSPOSE4_PS_ROW01.val[1]), \
-                          vget_low_f32(SIMDE_MM_TRANSPOSE4_PS_ROW23.val[1])); \
-      row2 = vcombine_f32(vget_high_f32(SIMDE_MM_TRANSPOSE4_PS_ROW01.val[0]), \
-                          vget_high_f32(SIMDE_MM_TRANSPOSE4_PS_ROW23.val[0])); \
-      row3 = vcombine_f32(vget_high_f32(SIMDE_MM_TRANSPOSE4_PS_ROW01.val[1]), \
-                          vget_high_f32(SIMDE_MM_TRANSPOSE4_PS_ROW23.val[1])); \
-    } while (0)
-#else
-  #define SIMDE_MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
-    do { \
-      simde__m128 SIMDE_MM_TRANSPOSE4_PS_tmp3, SIMDE_MM_TRANSPOSE4_PS_tmp2, SIMDE_MM_TRANSPOSE4_PS_tmp1, SIMDE_MM_TRANSPOSE4_PS_tmp0; \
-      SIMDE_MM_TRANSPOSE4_PS_tmp0 = simde_mm_unpacklo_ps((row0), (row1)); \
-      SIMDE_MM_TRANSPOSE4_PS_tmp2 = simde_mm_unpacklo_ps((row2), (row3)); \
-      SIMDE_MM_TRANSPOSE4_PS_tmp1 = simde_mm_unpackhi_ps((row0), (row1)); \
-      SIMDE_MM_TRANSPOSE4_PS_tmp3 = simde_mm_unpackhi_ps((row2), (row3)); \
-      row0 = simde_mm_movelh_ps(SIMDE_MM_TRANSPOSE4_PS_tmp0, SIMDE_MM_TRANSPOSE4_PS_tmp2); \
-      row1 = simde_mm_movehl_ps(SIMDE_MM_TRANSPOSE4_PS_tmp2, SIMDE_MM_TRANSPOSE4_PS_tmp0); \
-      row2 = simde_mm_movelh_ps(SIMDE_MM_TRANSPOSE4_PS_tmp1, SIMDE_MM_TRANSPOSE4_PS_tmp3); \
-      row3 = simde_mm_movehl_ps(SIMDE_MM_TRANSPOSE4_PS_tmp3, SIMDE_MM_TRANSPOSE4_PS_tmp1); \
-    } while (0)
-#endif
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
-# define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) SIMDE_MM_TRANSPOSE4_PS(row0, row1, row2, row3)
-#endif
-
-SIMDE_END_DECLS_
-
-HEDLEY_DIAGNOSTIC_POP
-
-#endif /* !defined(SIMDE_X86_SSE_H) */
diff --git a/extern/simde/x86/sse2.h b/extern/simde/x86/sse2.h
deleted file mode 100644
index 024fe26ac..000000000
--- a/extern/simde/x86/sse2.h
+++ /dev/null
@@ -1,7737 +0,0 @@
-/* SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy,
- * modify, merge, publish, distribute, sublicense, and/or sell copies
- * of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Copyright:
- *   2017-2020 Evan Nemerson
- *   2015-2017 John W.
Ratcliff - * 2015 Brandon Rowlett - * 2015 Ken Fast - * 2017 Hasindu Gamaarachchi - * 2018 Jeff Daily - */ - -#if !defined(SIMDE_X86_SSE2_H) -#define SIMDE_X86_SSE2_H - -#include "sse.h" -#include "../simde-f16.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -typedef union { - #if defined(SIMDE_VECTOR_SUBSCRIPT) - SIMDE_ALIGN_TO_16 int8_t i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 int16_t i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 int32_t i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 int64_t i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 uint8_t u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 uint16_t u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 uint32_t u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 uint64_t u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - #if defined(SIMDE_HAVE_INT128_) - SIMDE_ALIGN_TO_16 simde_int128 i128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 simde_uint128 u128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - #endif - #if defined(SIMDE_FLOAT16_VECTOR) - SIMDE_ALIGN_TO_16 simde_float16 f16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - #else - SIMDE_ALIGN_TO_16 simde_float16 f16[8]; - #endif - SIMDE_ALIGN_TO_16 simde_float32 f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 simde_float64 f64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - - SIMDE_ALIGN_TO_16 int_fast32_t i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - #else - SIMDE_ALIGN_TO_16 int8_t i8[16]; - SIMDE_ALIGN_TO_16 int16_t i16[8]; - SIMDE_ALIGN_TO_16 int32_t i32[4]; - SIMDE_ALIGN_TO_16 int64_t i64[2]; - SIMDE_ALIGN_TO_16 uint8_t u8[16]; - SIMDE_ALIGN_TO_16 uint16_t u16[8]; - SIMDE_ALIGN_TO_16 uint32_t u32[4]; - SIMDE_ALIGN_TO_16 uint64_t u64[2]; - #if defined(SIMDE_HAVE_INT128_) - SIMDE_ALIGN_TO_16 simde_int128 i128[1]; - SIMDE_ALIGN_TO_16 simde_uint128 u128[1]; - #endif - SIMDE_ALIGN_TO_16 simde_float16 f16[8]; - SIMDE_ALIGN_TO_16 simde_float32 f32[4]; - SIMDE_ALIGN_TO_16 simde_float64 f64[2]; - - SIMDE_ALIGN_TO_16 int_fast32_t i32f[16 / sizeof(int_fast32_t)]; - SIMDE_ALIGN_TO_16 uint_fast32_t u32f[16 / sizeof(uint_fast32_t)]; - #endif - - SIMDE_ALIGN_TO_16 simde__m64_private m64_private[2]; - SIMDE_ALIGN_TO_16 simde__m64 m64[2]; - - #if defined(SIMDE_X86_SSE2_NATIVE) - SIMDE_ALIGN_TO_16 __m128i n; - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - SIMDE_ALIGN_TO_16 int8x16_t neon_i8; - SIMDE_ALIGN_TO_16 int16x8_t neon_i16; - SIMDE_ALIGN_TO_16 int32x4_t neon_i32; - SIMDE_ALIGN_TO_16 int64x2_t neon_i64; - SIMDE_ALIGN_TO_16 uint8x16_t neon_u8; - SIMDE_ALIGN_TO_16 uint16x8_t neon_u16; - SIMDE_ALIGN_TO_16 uint32x4_t neon_u32; - SIMDE_ALIGN_TO_16 uint64x2_t neon_u64; - #if defined(__ARM_FP16_FORMAT_IEEE) - SIMDE_ALIGN_TO_16 float16x8_t neon_f16; - #endif - SIMDE_ALIGN_TO_16 float32x4_t neon_f32; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - SIMDE_ALIGN_TO_16 float64x2_t neon_f64; - #endif - #elif defined(SIMDE_MIPS_MSA_NATIVE) - v16i8 msa_i8; - v8i16 msa_i16; - v4i32 msa_i32; - v2i64 msa_i64; - v16u8 msa_u8; - v8u16 msa_u16; - v4u32 msa_u32; - v2u64 msa_u64; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - SIMDE_ALIGN_TO_16 v128_t wasm_v128; - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32; - #if 
defined(__UINT_FAST32_TYPE__) && (defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(__INT_FAST32_TYPE__) altivec_i32f; - #else - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32f; - #endif - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32; - #if defined(__UINT_FAST32_TYPE__) && (defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(__UINT_FAST32_TYPE__) altivec_u32f; - #else - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32f; - #endif - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32; - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64; - #endif - #endif -} simde__m128i_private; - -typedef union { - #if defined(SIMDE_VECTOR_SUBSCRIPT) - SIMDE_ALIGN_TO_16 int8_t i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 int16_t i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 int32_t i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 int64_t i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 uint8_t u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 uint16_t u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 uint32_t u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 uint64_t u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 simde_float32 f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 simde_float64 f64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 int_fast32_t i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - #else - SIMDE_ALIGN_TO_16 int8_t i8[16]; - SIMDE_ALIGN_TO_16 int16_t i16[8]; - SIMDE_ALIGN_TO_16 int32_t i32[4]; - SIMDE_ALIGN_TO_16 int64_t i64[2]; - SIMDE_ALIGN_TO_16 uint8_t u8[16]; - SIMDE_ALIGN_TO_16 uint16_t u16[8]; - SIMDE_ALIGN_TO_16 uint32_t u32[4]; - SIMDE_ALIGN_TO_16 uint64_t u64[2]; - SIMDE_ALIGN_TO_16 simde_float32 f32[4]; - SIMDE_ALIGN_TO_16 simde_float64 f64[2]; - SIMDE_ALIGN_TO_16 int_fast32_t i32f[16 / sizeof(int_fast32_t)]; - SIMDE_ALIGN_TO_16 uint_fast32_t u32f[16 / sizeof(uint_fast32_t)]; - #endif - - SIMDE_ALIGN_TO_16 simde__m64_private m64_private[2]; - SIMDE_ALIGN_TO_16 simde__m64 m64[2]; - - #if defined(SIMDE_X86_SSE2_NATIVE) - SIMDE_ALIGN_TO_16 __m128d n; - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - SIMDE_ALIGN_TO_16 int8x16_t neon_i8; - SIMDE_ALIGN_TO_16 int16x8_t neon_i16; - SIMDE_ALIGN_TO_16 int32x4_t neon_i32; - SIMDE_ALIGN_TO_16 int64x2_t neon_i64; - SIMDE_ALIGN_TO_16 uint8x16_t neon_u8; - SIMDE_ALIGN_TO_16 uint16x8_t neon_u16; - SIMDE_ALIGN_TO_16 uint32x4_t neon_u32; - SIMDE_ALIGN_TO_16 uint64x2_t neon_u64; - SIMDE_ALIGN_TO_16 float32x4_t neon_f32; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - SIMDE_ALIGN_TO_16 float64x2_t neon_f64; - #endif - #elif defined(SIMDE_MIPS_MSA_NATIVE) - v16i8 msa_i8; - v8i16 msa_i16; - v4i32 msa_i32; - v2i64 msa_i64; - v16u8 msa_u8; - v8u16 msa_u16; - v4u32 msa_u32; - v2u64 msa_u64; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - SIMDE_ALIGN_TO_16 v128_t wasm_v128; 
- #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32; - #if defined(__INT_FAST32_TYPE__) && (defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(__INT_FAST32_TYPE__) altivec_i32f; - #else - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32f; - #endif - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32; - #if defined(__UINT_FAST32_TYPE__) && (defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(__UINT_FAST32_TYPE__) altivec_u32f; - #else - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32f; - #endif - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32; - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64; - #endif - #endif -} simde__m128d_private; - -#if defined(SIMDE_X86_SSE2_NATIVE) - typedef __m128i simde__m128i; - typedef __m128d simde__m128d; -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - typedef int64x2_t simde__m128i; -# if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - typedef float64x2_t simde__m128d; -# elif defined(SIMDE_VECTOR_SUBSCRIPT) - typedef simde_float64 simde__m128d SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; -# else - typedef simde__m128d_private simde__m128d; -# endif -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - typedef v128_t simde__m128i; - typedef v128_t simde__m128d; -#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - typedef SIMDE_POWER_ALTIVEC_VECTOR(float) simde__m128i; - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - typedef SIMDE_POWER_ALTIVEC_VECTOR(double) simde__m128d; - #else - typedef simde__m128d_private simde__m128d; - #endif -#elif defined(SIMDE_VECTOR_SUBSCRIPT) - typedef int64_t simde__m128i SIMDE_ALIGN_TO_16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - typedef simde_float64 simde__m128d SIMDE_ALIGN_TO_16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; -#else - typedef simde__m128i_private simde__m128i; - typedef simde__m128d_private simde__m128d; -#endif - -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - typedef simde__m128i __m128i; - typedef simde__m128d __m128d; -#endif - -HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128i), "simde__m128i size incorrect"); -HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128i_private), "simde__m128i_private size incorrect"); -HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128d), "simde__m128d size incorrect"); -HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128d_private), "simde__m128d_private size incorrect"); -#if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF) -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128i) == 16, "simde__m128i is not 16-byte aligned"); -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128i_private) == 16, "simde__m128i_private is not 16-byte aligned"); 
-HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128d) == 16, "simde__m128d is not 16-byte aligned"); -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128d_private) == 16, "simde__m128d_private is not 16-byte aligned"); -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde__m128i_from_private(simde__m128i_private v) { - simde__m128i r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i_private -simde__m128i_to_private(simde__m128i v) { - simde__m128i_private r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde__m128d_from_private(simde__m128d_private v) { - simde__m128d r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d_private -simde__m128d_to_private(simde__m128d v) { - simde__m128d_private r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int8x16_t, neon, i8) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int16x8_t, neon, i16) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int32x4_t, neon, i32) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int64x2_t, neon, i64) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint8x16_t, neon, u8) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint16x8_t, neon, u16) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint32x4_t, neon, u32) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint64x2_t, neon, u64) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, float32x4_t, neon, f32) - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, float64x2_t, neon, f64) - #endif -#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed char), altivec, i8) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed short), altivec, i16) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed int), altivec, i32) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), altivec, u8) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), altivec, u16) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), altivec, u32) - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), altivec, u64) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed long long), altivec, i64) - #endif -#endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */ - -#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int8x16_t, neon, i8) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int16x8_t, neon, i16) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int32x4_t, neon, i32) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int64x2_t, neon, i64) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint8x16_t, neon, u8) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint16x8_t, neon, u16) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint32x4_t, neon, u32) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint64x2_t, neon, u64) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, float32x4_t, neon, f32) - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, float64x2_t, neon, 
f64) - #endif -#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed char), altivec, i8) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed short), altivec, i16) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed int), altivec, i32) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), altivec, u8) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), altivec, u16) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), altivec, u32) - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), altivec, u64) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed long long), altivec, i64) - #if defined(SIMDE_BUG_GCC_95782) - SIMDE_FUNCTION_ATTRIBUTES - SIMDE_POWER_ALTIVEC_VECTOR(double) - simde__m128d_to_altivec_f64(simde__m128d value) { - simde__m128d_private r_ = simde__m128d_to_private(value); - return r_.altivec_f64; - } - - SIMDE_FUNCTION_ATTRIBUTES - simde__m128d - simde__m128d_from_altivec_f64(SIMDE_POWER_ALTIVEC_VECTOR(double) value) { - simde__m128d_private r_; - r_.altivec_f64 = value; - return simde__m128d_from_private(r_); - } - #else - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(double), altivec, f64) - #endif - #endif -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, v128_t, wasm, v128); - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, v128_t, wasm, v128); -#endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_set_pd (simde_float64 e1, simde_float64 e0) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_set_pd(e1, e0); - #else - simde__m128d_private r_; - - #if defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_make(e0, e1); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - SIMDE_ALIGN_TO_16 simde_float64 data[2] = { e0, e1 }; - r_.neon_f64 = vld1q_f64(data); - #else - r_.f64[0] = e0; - r_.f64[1] = e1; - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_set_pd(e1, e0) simde_mm_set_pd(e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_set1_pd (simde_float64 a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_set1_pd(a); - #else - simde__m128d_private r_; - - #if defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_splat(a); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vdupq_n_f64(a); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = vec_splats(HEDLEY_STATIC_CAST(double, a)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.f64[i] = a; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#define simde_mm_set_pd1(a) simde_mm_set1_pd(a) -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_set1_pd(a) simde_mm_set1_pd(a) - #define _mm_set_pd1(a) simde_mm_set1_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_x_mm_abs_pd(simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - simde_float64 mask_; - uint64_t u64_ = UINT64_C(0x7FFFFFFFFFFFFFFF); - 
simde_memcpy(&mask_, &u64_, sizeof(u64_)); - return _mm_and_pd(_mm_set1_pd(mask_), a); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vabsq_f64(a_.neon_f64); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = vec_abs(a_.altivec_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_abs(a_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_fabs(a_.f64[i]); - } - #endif - - return simde__m128d_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_x_mm_not_pd(simde__m128d a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - __m128i ai = _mm_castpd_si128(a); - return _mm_castsi128_pd(_mm_ternarylogic_epi64(ai, ai, ai, 0x55)); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vmvnq_s32(a_.neon_i32); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - r_.altivec_f64 = vec_nor(a_.altivec_f64, a_.altivec_f64); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i32 = vec_nor(a_.altivec_i32, a_.altivec_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_not(a_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = ~a_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = ~(a_.i32f[i]); - } - #endif - - return simde__m128d_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_x_mm_select_pd(simde__m128d a, simde__m128d b, simde__m128d mask) { - /* This function is for when you want to blend two elements together - * according to a mask. It is similar to _mm_blendv_pd, except that - * it is undefined whether the blend is based on the highest bit in - * each lane (like blendv) or just bitwise operations. This allows - * us to implement the function efficiently everywhere. - * - * Basically, you promise that all the lanes in mask are either 0 or - * ~0. 
*/ - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_blendv_pd(a, b, mask); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b), - mask_ = simde__m128d_to_private(mask); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = a_.i64 ^ ((a_.i64 ^ b_.i64) & mask_.i64); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vbslq_s64(mask_.neon_u64, b_.neon_i64, a_.neon_i64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[i] ^ ((a_.i64[i] ^ b_.i64[i]) & mask_.i64[i]); - } - #endif - - return simde__m128d_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_add_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_add_epi8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vaddq_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i8 = vec_add(a_.altivec_i8, b_.altivec_i8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_add(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = a_.i8 + b_.i8; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[i] + b_.i8[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_add_epi8(a, b) simde_mm_add_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_add_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_add_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vaddq_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i16 = vec_add(a_.altivec_i16, b_.altivec_i16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_add(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = a_.i16 + b_.i16; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[i] + b_.i16[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_add_epi16(a, b) simde_mm_add_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_add_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_add_epi32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vaddq_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i32 = vec_add(a_.altivec_i32, b_.altivec_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_add(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = a_.i32 + b_.i32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] + b_.i32[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_add_epi32(a, b) simde_mm_add_epi32(a, b) 
-#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_add_epi64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_add_epi64(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vaddq_s64(a_.neon_i64, b_.neon_i64); - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - r_.altivec_i64 = vec_add(a_.altivec_i64, b_.altivec_i64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x2_add(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = a_.i64 + b_.i64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[i] + b_.i64[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_add_epi64(a, b) simde_mm_add_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_add_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_add_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vaddq_f64(a_.neon_f64, b_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_add(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - r_.altivec_f64 = vec_add(a_.altivec_f64, b_.altivec_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_add(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f64 = a_.f64 + b_.f64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f64[i] + b_.f64[i]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_add_pd(a, b) simde_mm_add_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_move_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_move_sd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vsetq_lane_f64(vgetq_lane_f64(b_.neon_f64, 0), a_.neon_f64, 0); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - #if defined(HEDLEY_IBM_VERSION) - r_.altivec_f64 = vec_xxpermdi(a_.altivec_f64, b_.altivec_f64, 1); - #else - r_.altivec_f64 = vec_xxpermdi(b_.altivec_f64, a_.altivec_f64, 1); - #endif - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 2, 1); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 2, 1); - #else - r_.f64[0] = b_.f64[0]; - r_.f64[1] = a_.f64[1]; - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_move_sd(a, b) simde_mm_move_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_x_mm_broadcastlow_pd(simde__m128d a) { - /* This function broadcasts the first element in the input vector to - * all lanes. It is used to avoid generating spurious exceptions in - * *_sd functions since there may be garbage in the upper lanes. 
*/ - - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_castsi128_pd(_mm_shuffle_epi32(_mm_castpd_si128(a), 0x44)); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vdupq_laneq_f64(a_.neon_f64, 0); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - r_.altivec_f64 = vec_splat(a_.altivec_f64, 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_splat(a_.f64[0]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, a_.f64, 0, 0); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f64[0]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_add_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_add_sd(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_sd(a, simde_mm_add_pd(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_sd(a, simde_mm_add_pd(simde_x_mm_broadcastlow_pd(a), simde_x_mm_broadcastlow_pd(b))); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - r_.f64[0] = a_.f64[0] + b_.f64[0]; - r_.f64[1] = a_.f64[1]; - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_add_sd(a, b) simde_mm_add_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_add_si64 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_add_si64(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vadd_s64(a_.neon_i64, b_.neon_i64); - #else - r_.i64[0] = a_.i64[0] + b_.i64[0]; - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_add_si64(a, b) simde_mm_add_si64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_adds_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_adds_epi8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vqaddq_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_add_sat(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i8 = vec_adds(a_.altivec_i8, b_.altivec_i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = simde_math_adds_i8(a_.i8[i], b_.i8[i]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_adds_epi8(a, b) simde_mm_adds_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_adds_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_adds_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vqaddq_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_add_sat(a_.wasm_v128, b_.wasm_v128); - #elif 
defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i16 = vec_adds(a_.altivec_i16, b_.altivec_i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = simde_math_adds_i16(a_.i16[i], b_.i16[i]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_adds_epi16(a, b) simde_mm_adds_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_adds_epu8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_adds_epu8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vqaddq_u8(a_.neon_u8, b_.neon_u8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u8x16_add_sat(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - r_.altivec_u8 = vec_adds(a_.altivec_u8, b_.altivec_u8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = simde_math_adds_u8(a_.u8[i], b_.u8[i]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_adds_epu8(a, b) simde_mm_adds_epu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_adds_epu16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_adds_epu16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vqaddq_u16(a_.neon_u16, b_.neon_u16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u16x8_add_sat(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_u16 = vec_adds(a_.altivec_u16, b_.altivec_u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = simde_math_adds_u16(a_.u16[i], b_.u16[i]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_adds_epu16(a, b) simde_mm_adds_epu16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_and_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_and_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vandq_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - r_.altivec_f64 = vec_and(a_.altivec_f64, b_.altivec_f64); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f & b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = a_.i32f[i] & b_.i32f[i]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_and_pd(a, b) simde_mm_and_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_and_si128 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_and_si128(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if 
defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vandq_s32(b_.neon_i32, a_.neon_i32); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_u32f = vec_and(a_.altivec_u32f, b_.altivec_u32f); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f & b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = a_.i32f[i] & b_.i32f[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_and_si128(a, b) simde_mm_and_si128(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_andnot_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_andnot_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vbicq_s32(b_.neon_i32, a_.neon_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = vec_andc(b_.altivec_f64, a_.altivec_f64); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i32f = vec_andc(b_.altivec_i32f, a_.altivec_i32f); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = ~a_.i32f & b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = ~a_.u64[i] & b_.u64[i]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_andnot_pd(a, b) simde_mm_andnot_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_andnot_si128 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_andnot_si128(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vbicq_s32(b_.neon_i32, a_.neon_i32); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i32 = vec_andc(b_.altivec_i32, a_.altivec_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = ~a_.i32f & b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = ~(a_.i32f[i]) & b_.i32f[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_andnot_si128(a, b) simde_mm_andnot_si128(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_xor_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_xor_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f ^ b_.i32f; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_xor(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = veorq_s64(a_.neon_i64, b_.neon_i64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = a_.i32f[i] ^ b_.i32f[i]; - } - 
#endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_xor_pd(a, b) simde_mm_xor_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_avg_epu8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_avg_epu8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vrhaddq_u8(b_.neon_u8, a_.neon_u8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u8x16_avgr(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_u8 = vec_avg(a_.altivec_u8, b_.altivec_u8); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_) - uint16_t wa SIMDE_VECTOR(32); - uint16_t wb SIMDE_VECTOR(32); - uint16_t wr SIMDE_VECTOR(32); - SIMDE_CONVERT_VECTOR_(wa, a_.u8); - SIMDE_CONVERT_VECTOR_(wb, b_.u8); - wr = (wa + wb + 1) >> 1; - SIMDE_CONVERT_VECTOR_(r_.u8, wr); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = (a_.u8[i] + b_.u8[i] + 1) >> 1; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_avg_epu8(a, b) simde_mm_avg_epu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_avg_epu16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_avg_epu16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vrhaddq_u16(b_.neon_u16, a_.neon_u16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u16x8_avgr(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_u16 = vec_avg(a_.altivec_u16, b_.altivec_u16); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_) - uint32_t wa SIMDE_VECTOR(32); - uint32_t wb SIMDE_VECTOR(32); - uint32_t wr SIMDE_VECTOR(32); - SIMDE_CONVERT_VECTOR_(wa, a_.u16); - SIMDE_CONVERT_VECTOR_(wb, b_.u16); - wr = (wa + wb + 1) >> 1; - SIMDE_CONVERT_VECTOR_(r_.u16, wr); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = (a_.u16[i] + b_.u16[i] + 1) >> 1; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_avg_epu16(a, b) simde_mm_avg_epu16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_setzero_si128 (void) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_setzero_si128(); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vdupq_n_s32(0); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i32 = vec_splats(HEDLEY_STATIC_CAST(signed int, 0)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_splat(INT32_C(0)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT) - r_.i32 = __extension__ (__typeof__(r_.i32)) { 0, 0, 0, 0 }; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = 0; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} 
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_setzero_si128() (simde_mm_setzero_si128()) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_bslli_si128 (simde__m128i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - if (HEDLEY_UNLIKELY((imm8 & ~15))) { - return simde_mm_setzero_si128(); - } - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && defined(SIMDE_ENDIAN_ORDER) - r_.altivec_i8 = - #if (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) - vec_slo - #else /* SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG */ - vec_sro - #endif - (a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, imm8 * 8))); - #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i8 = vec_srb(a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, (imm8 & 15) << 3))); - #elif defined(SIMDE_HAVE_INT128_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) - r_.u128[0] = a_.u128[0] << (imm8 * 8); - #else - r_ = simde__m128i_to_private(simde_mm_setzero_si128()); - for (int i = imm8 ; i < HEDLEY_STATIC_CAST(int, sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[i - imm8]; - } - #endif - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI) - #define simde_mm_bslli_si128(a, imm8) _mm_slli_si128(a, imm8) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) - #define simde_mm_bslli_si128(a, imm8) \ - simde__m128i_from_neon_i8(((imm8) <= 0) ? simde__m128i_to_neon_i8(a) : (((imm8) > 15) ? (vdupq_n_s8(0)) : (vextq_s8(vdupq_n_s8(0), simde__m128i_to_neon_i8(a), 16 - (imm8))))) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - #define simde_mm_bslli_si128(a, imm8) __extension__ ({ \ - simde__m128i_from_wasm_v128( \ - wasm_i8x16_shuffle(wasm_i32x4_splat(INT32_C(0)), \ - simde__m128i_to_wasm_v128((a)), \ - ((imm8)&0xF0) ? 0 : 16 - ((imm8)&0xF), \ - ((imm8)&0xF0) ? 0 : 17 - ((imm8)&0xF), \ - ((imm8)&0xF0) ? 0 : 18 - ((imm8)&0xF), \ - ((imm8)&0xF0) ? 0 : 19 - ((imm8)&0xF), \ - ((imm8)&0xF0) ? 0 : 20 - ((imm8)&0xF), \ - ((imm8)&0xF0) ? 0 : 21 - ((imm8)&0xF), \ - ((imm8)&0xF0) ? 0 : 22 - ((imm8)&0xF), \ - ((imm8)&0xF0) ? 0 : 23 - ((imm8)&0xF), \ - ((imm8)&0xF0) ? 0 : 24 - ((imm8)&0xF), \ - ((imm8)&0xF0) ? 0 : 25 - ((imm8)&0xF), \ - ((imm8)&0xF0) ? 0 : 26 - ((imm8)&0xF), \ - ((imm8)&0xF0) ? 0 : 27 - ((imm8)&0xF), \ - ((imm8)&0xF0) ? 0 : 28 - ((imm8)&0xF), \ - ((imm8)&0xF0) ? 0 : 29 - ((imm8)&0xF), \ - ((imm8)&0xF0) ? 0 : 30 - ((imm8)&0xF), \ - ((imm8)&0xF0) ? 
0 : 31 - ((imm8)&0xF))); }) -#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && !defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - #define simde_mm_bslli_si128(a, imm8) (__extension__ ({ \ - const simde__m128i_private simde_tmp_a_ = simde__m128i_to_private(a); \ - const simde__m128i_private simde_tmp_z_ = simde__m128i_to_private(simde_mm_setzero_si128()); \ - simde__m128i_private simde_tmp_r_; \ - if (HEDLEY_UNLIKELY(imm8 > 15)) { \ - simde_tmp_r_ = simde__m128i_to_private(simde_mm_setzero_si128()); \ - } else { \ - simde_tmp_r_.i8 = \ - SIMDE_SHUFFLE_VECTOR_(8, 16, \ - simde_tmp_z_.i8, \ - (simde_tmp_a_).i8, \ - HEDLEY_STATIC_CAST(int8_t, (16 - imm8) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (17 - imm8) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (18 - imm8) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (19 - imm8) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (20 - imm8) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (21 - imm8) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (22 - imm8) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (23 - imm8) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (24 - imm8) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (25 - imm8) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (26 - imm8) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (27 - imm8) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (28 - imm8) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (29 - imm8) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (30 - imm8) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (31 - imm8) & 31)); \ - } \ - simde__m128i_from_private(simde_tmp_r_); })) -#endif -#define simde_mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8) -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_bslli_si128(a, imm8) simde_mm_bslli_si128(a, imm8) - #define _mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_bsrli_si128 (simde__m128i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - if (HEDLEY_UNLIKELY((imm8 & ~15))) { - return simde_mm_setzero_si128(); - } - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && defined(SIMDE_ENDIAN_ORDER) - r_.altivec_i8 = - #if (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) - vec_sro - #else /* SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG */ - vec_slo - #endif - (a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, imm8 * 8))); - #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i8 = vec_slb(a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, (imm8 & 15) << 3))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - const int e = HEDLEY_STATIC_CAST(int, i) + imm8; - r_.i8[i] = (e < 16) ? a_.i8[e] : 0; - } - #endif - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI) - #define simde_mm_bsrli_si128(a, imm8) _mm_srli_si128(a, imm8) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) - #define simde_mm_bsrli_si128(a, imm8) \ - simde__m128i_from_neon_i8(((imm8 < 0) || (imm8 > 15)) ? vdupq_n_s8(0) : (vextq_s8(simde__m128i_to_private(a).neon_i8, vdupq_n_s8(0), ((imm8 & 15) != 0) ? 
imm8 : (imm8 & 15)))) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - #define simde_mm_bsrli_si128(a, imm8) (__extension__ ({ \ - const simde__m128i_private simde_tmp_a_ = simde__m128i_to_private(a); \ - const simde__m128i_private simde_tmp_z_ = simde__m128i_to_private(simde_mm_setzero_si128()); \ - simde__m128i_private simde_tmp_r_ = simde__m128i_to_private(a); \ - if (HEDLEY_UNLIKELY(imm8 > 15)) { \ - simde_tmp_r_ = simde__m128i_to_private(simde_mm_setzero_si128()); \ - } else { \ - simde_tmp_r_.wasm_v128 = \ - wasm_i8x16_shuffle( \ - simde_tmp_z_.wasm_v128, \ - simde_tmp_a_.wasm_v128, \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 16) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 17) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 18) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 19) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 20) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 21) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 22) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 23) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 24) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 25) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 26) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 27) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 28) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 29) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 30) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 31) & 31)); \ - } \ - simde__m128i_from_private(simde_tmp_r_); })) -#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && !defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - #define simde_mm_bsrli_si128(a, imm8) (__extension__ ({ \ - const simde__m128i_private simde_tmp_a_ = simde__m128i_to_private(a); \ - const simde__m128i_private simde_tmp_z_ = simde__m128i_to_private(simde_mm_setzero_si128()); \ - simde__m128i_private simde_tmp_r_ = simde__m128i_to_private(a); \ - if (HEDLEY_UNLIKELY(imm8 > 15)) { \ - simde_tmp_r_ = simde__m128i_to_private(simde_mm_setzero_si128()); \ - } else { \ - simde_tmp_r_.i8 = \ - SIMDE_SHUFFLE_VECTOR_(8, 16, \ - simde_tmp_z_.i8, \ - (simde_tmp_a_).i8, \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 16) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 17) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 18) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 19) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 20) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 21) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 22) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 23) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 24) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 25) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 26) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 27) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 28) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 29) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 30) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 31) & 31)); \ - } \ - simde__m128i_from_private(simde_tmp_r_); })) -#endif -#define simde_mm_srli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8)) -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_bsrli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8)) - #define _mm_srli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_clflush (void const* p) { - #if defined(SIMDE_X86_SSE2_NATIVE) - _mm_clflush(p); - #else - (void) p; - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_clflush(p) simde_mm_clflush(p) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_comieq_sd (simde__m128d a, 
simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_comieq_sd(a, b); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return !!vgetq_lane_u64(vceqq_f64(a_.neon_f64, b_.neon_f64), 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f64x2_extract_lane(a_.wasm_v128, 0) == wasm_f64x2_extract_lane(b_.wasm_v128, 0); - #else - return a_.f64[0] == b_.f64[0]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_comieq_sd(a, b) simde_mm_comieq_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_comige_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_comige_sd(a, b); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return !!vgetq_lane_u64(vcgeq_f64(a_.neon_f64, b_.neon_f64), 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f64x2_extract_lane(a_.wasm_v128, 0) >= wasm_f64x2_extract_lane(b_.wasm_v128, 0); - #else - return a_.f64[0] >= b_.f64[0]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_comige_sd(a, b) simde_mm_comige_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_comigt_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_comigt_sd(a, b); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return !!vgetq_lane_u64(vcgtq_f64(a_.neon_f64, b_.neon_f64), 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f64x2_extract_lane(a_.wasm_v128, 0) > wasm_f64x2_extract_lane(b_.wasm_v128, 0); - #else - return a_.f64[0] > b_.f64[0]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_comigt_sd(a, b) simde_mm_comigt_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_comile_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_comile_sd(a, b); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return !!vgetq_lane_u64(vcleq_f64(a_.neon_f64, b_.neon_f64), 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f64x2_extract_lane(a_.wasm_v128, 0) <= wasm_f64x2_extract_lane(b_.wasm_v128, 0); - #else - return a_.f64[0] <= b_.f64[0]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_comile_sd(a, b) simde_mm_comile_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_comilt_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_comilt_sd(a, b); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return !!vgetq_lane_u64(vcltq_f64(a_.neon_f64, b_.neon_f64), 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f64x2_extract_lane(a_.wasm_v128, 0) < wasm_f64x2_extract_lane(b_.wasm_v128, 0); - #else - return a_.f64[0] < b_.f64[0]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_comilt_sd(a, b) simde_mm_comilt_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_comineq_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_comineq_sd(a, b); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a), 
- b_ = simde__m128d_to_private(b); - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return !vgetq_lane_u64(vceqq_f64(a_.neon_f64, b_.neon_f64), 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f64x2_extract_lane(a_.wasm_v128, 0) != wasm_f64x2_extract_lane(b_.wasm_v128, 0); - #else - return a_.f64[0] != b_.f64[0]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_comineq_sd(a, b) simde_mm_comineq_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_x_mm_copysign_pd(simde__m128d dest, simde__m128d src) { - simde__m128d_private - r_, - dest_ = simde__m128d_to_private(dest), - src_ = simde__m128d_to_private(src); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - uint64x2_t sign_pos = vreinterpretq_u64_f64(vdupq_n_f64(-SIMDE_FLOAT64_C(0.0))); - #else - simde_float64 dbl_nz = -SIMDE_FLOAT64_C(0.0); - uint64_t u64_nz; - simde_memcpy(&u64_nz, &dbl_nz, sizeof(u64_nz)); - uint64x2_t sign_pos = vdupq_n_u64(u64_nz); - #endif - r_.neon_u64 = vbslq_u64(sign_pos, src_.neon_u64, dest_.neon_u64); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - #if defined(SIMDE_BUG_VEC_CPSGN_REVERSED_ARGS) - r_.altivec_f64 = vec_cpsgn(dest_.altivec_f64, src_.altivec_f64); - #else - r_.altivec_f64 = vec_cpsgn(src_.altivec_f64, dest_.altivec_f64); - #endif - #elif defined(simde_math_copysign) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_copysign(dest_.f64[i], src_.f64[i]); - } - #else - simde__m128d sgnbit = simde_mm_set1_pd(-SIMDE_FLOAT64_C(0.0)); - return simde_mm_xor_pd(simde_mm_and_pd(sgnbit, src), simde_mm_andnot_pd(sgnbit, dest)); - #endif - - return simde__m128d_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_x_mm_xorsign_pd(simde__m128d dest, simde__m128d src) { - return simde_mm_xor_pd(simde_mm_and_pd(simde_mm_set1_pd(-0.0), src), dest); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_castpd_ps (simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_castpd_ps(a); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vreinterpretq_f32_f64(a); - #else - simde__m128 r; - simde_memcpy(&r, &a, sizeof(a)); - return r; - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_castpd_ps(a) simde_mm_castpd_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_castpd_si128 (simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_castpd_si128(a); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vreinterpretq_s64_f64(a); - #else - simde__m128i r; - simde_memcpy(&r, &a, sizeof(a)); - return r; - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_castpd_si128(a) simde_mm_castpd_si128(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_castps_pd (simde__m128 a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_castps_pd(a); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vreinterpretq_f64_f32(a); - #else - simde__m128d r; - simde_memcpy(&r, &a, sizeof(a)); - return r; - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_castps_pd(a) simde_mm_castps_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_castps_si128 (simde__m128 a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_castps_si128(a); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - return simde__m128i_from_neon_i32(simde__m128_to_private(a).neon_i32); - #else - simde__m128i r; - simde_memcpy(&r, &a, sizeof(a)); - return r; - #endif 
-} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_castps_si128(a) simde_mm_castps_si128(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_castsi128_pd (simde__m128i a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_castsi128_pd(a); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vreinterpretq_f64_s64(a); - #else - simde__m128d r; - simde_memcpy(&r, &a, sizeof(a)); - return r; - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_castsi128_pd(a) simde_mm_castsi128_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_castsi128_ps (simde__m128i a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_castsi128_ps(a); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - return HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), a); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - return simde__m128_from_neon_i32(simde__m128i_to_private(a).neon_i32); - #else - simde__m128 r; - simde_memcpy(&r, &a, sizeof(a)); - return r; - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_castsi128_ps(a) simde_mm_castsi128_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cmpeq_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpeq_epi8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vceqq_s8(b_.neon_i8, a_.neon_i8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_eq(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_cmpeq(a_.altivec_i8, b_.altivec_i8)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), (a_.i8 == b_.i8)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] == b_.i8[i]) ? ~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpeq_epi8(a, b) simde_mm_cmpeq_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cmpeq_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpeq_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vceqq_s16(b_.neon_i16, a_.neon_i16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_eq(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmpeq(a_.altivec_i16, b_.altivec_i16)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = (a_.i16 == b_.i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] == b_.i16[i]) ? 
~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpeq_epi16(a, b) simde_mm_cmpeq_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cmpeq_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpeq_epi32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vceqq_s32(b_.neon_i32, a_.neon_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_eq(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmpeq(a_.altivec_i32, b_.altivec_i32)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), a_.i32 == b_.i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpeq_epi32(a, b) simde_mm_cmpeq_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpeq_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpeq_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u64 = vceqq_f64(b_.neon_f64, a_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_eq(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpeq(a_.altivec_f64, b_.altivec_f64)); - #elif defined(SIMDE_MIPS_MSA_NATIVE) - r_.msa_i32 = __msa_addv_w(a_.msa_i32, b_.msa_i32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 == b_.f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.u64[i] = (a_.f64[i] == b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0); - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpeq_pd(a, b) simde_mm_cmpeq_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpeq_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpeq_sd(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_sd(a, simde_mm_cmpeq_pd(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_sd(a, simde_mm_cmpeq_pd(simde_x_mm_broadcastlow_pd(a), simde_x_mm_broadcastlow_pd(b))); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - r_.u64[0] = (a_.u64[0] == b_.u64[0]) ? 
~UINT64_C(0) : 0; - r_.u64[1] = a_.u64[1]; - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpeq_sd(a, b) simde_mm_cmpeq_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpneq_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpneq_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u32 = vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(b_.neon_f64, a_.neon_f64))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_ne(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 != b_.f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.u64[i] = (a_.f64[i] != b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0); - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpneq_pd(a, b) simde_mm_cmpneq_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpneq_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpneq_sd(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_sd(a, simde_mm_cmpneq_pd(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_sd(a, simde_mm_cmpneq_pd(simde_x_mm_broadcastlow_pd(a), simde_x_mm_broadcastlow_pd(b))); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - r_.u64[0] = (a_.f64[0] != b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); - r_.u64[1] = a_.u64[1]; - - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpneq_sd(a, b) simde_mm_cmpneq_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cmplt_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmplt_epi8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vcltq_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char),vec_cmplt(a_.altivec_i8, b_.altivec_i8)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_lt(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), (a_.i8 < b_.i8)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] < b_.i8[i]) ? 
~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmplt_epi8(a, b) simde_mm_cmplt_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cmplt_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmplt_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vcltq_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmplt(a_.altivec_i16, b_.altivec_i16)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_lt(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.i16 < b_.i16)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? ~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmplt_epi16(a, b) simde_mm_cmplt_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cmplt_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmplt_epi32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vcltq_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmplt(a_.altivec_i32, b_.altivec_i32)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_lt(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.i32 < b_.i32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = (a_.i32[i] < b_.i32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmplt_epi32(a, b) simde_mm_cmplt_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmplt_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmplt_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u64 = vcltq_f64(a_.neon_f64, b_.neon_f64); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmplt(a_.altivec_f64, b_.altivec_f64)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_lt(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 < b_.f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.u64[i] = (a_.f64[i] < b_.f64[i]) ? 
~UINT64_C(0) : UINT64_C(0); - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmplt_pd(a, b) simde_mm_cmplt_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmplt_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmplt_sd(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_sd(a, simde_mm_cmplt_pd(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_sd(a, simde_mm_cmplt_pd(simde_x_mm_broadcastlow_pd(a), simde_x_mm_broadcastlow_pd(b))); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - r_.u64[0] = (a_.f64[0] < b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); - r_.u64[1] = a_.u64[1]; - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmplt_sd(a, b) simde_mm_cmplt_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmple_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmple_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 <= b_.f64)); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u64 = vcleq_f64(a_.neon_f64, b_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_le(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmple(a_.altivec_f64, b_.altivec_f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.u64[i] = (a_.f64[i] <= b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0); - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmple_pd(a, b) simde_mm_cmple_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmple_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmple_sd(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_sd(a, simde_mm_cmple_pd(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_sd(a, simde_mm_cmple_pd(simde_x_mm_broadcastlow_pd(a), simde_x_mm_broadcastlow_pd(b))); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - r_.u64[0] = (a_.f64[0] <= b_.f64[0]) ? 
~UINT64_C(0) : UINT64_C(0); - r_.u64[1] = a_.u64[1]; - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmple_sd(a, b) simde_mm_cmple_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cmpgt_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpgt_epi8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vcgtq_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_gt(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_cmpgt(a_.altivec_i8, b_.altivec_i8)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), (a_.i8 > b_.i8)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] > b_.i8[i]) ? ~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpgt_epi8(a, b) simde_mm_cmpgt_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cmpgt_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpgt_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vcgtq_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_gt(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmpgt(a_.altivec_i16, b_.altivec_i16)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.i16 > b_.i16)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? ~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpgt_epi16(a, b) simde_mm_cmpgt_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cmpgt_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpgt_epi32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vcgtq_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_gt(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmpgt(a_.altivec_i32, b_.altivec_i32)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.i32 > b_.i32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = (a_.i32[i] > b_.i32[i]) ? 
~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpgt_epi32(a, b) simde_mm_cmpgt_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpgt_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpgt_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 > b_.f64)); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u64 = vcgtq_f64(a_.neon_f64, b_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_gt(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpgt(a_.altivec_f64, b_.altivec_f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.u64[i] = (a_.f64[i] > b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0); - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpgt_pd(a, b) simde_mm_cmpgt_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpgt_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI) - return _mm_cmpgt_sd(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_sd(a, simde_mm_cmpgt_pd(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_sd(a, simde_mm_cmpgt_pd(simde_x_mm_broadcastlow_pd(a), simde_x_mm_broadcastlow_pd(b))); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - r_.u64[0] = (a_.f64[0] > b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); - r_.u64[1] = a_.u64[1]; - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpgt_sd(a, b) simde_mm_cmpgt_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpge_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpge_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 >= b_.f64)); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u64 = vcgeq_f64(a_.neon_f64, b_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_ge(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpge(a_.altivec_f64, b_.altivec_f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.u64[i] = (a_.f64[i] >= b_.f64[i]) ? 
~UINT64_C(0) : UINT64_C(0); - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpge_pd(a, b) simde_mm_cmpge_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpge_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI) - return _mm_cmpge_sd(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_sd(a, simde_mm_cmpge_pd(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_sd(a, simde_mm_cmpge_pd(simde_x_mm_broadcastlow_pd(a), simde_x_mm_broadcastlow_pd(b))); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - r_.u64[0] = (a_.f64[0] >= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); - r_.u64[1] = a_.u64[1]; - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpge_sd(a, b) simde_mm_cmpge_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpngt_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpngt_pd(a, b); - #else - return simde_mm_cmple_pd(a, b); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpngt_pd(a, b) simde_mm_cmpngt_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpngt_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI) - return _mm_cmpngt_sd(a, b); - #else - return simde_mm_cmple_sd(a, b); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpngt_sd(a, b) simde_mm_cmpngt_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpnge_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpnge_pd(a, b); - #else - return simde_mm_cmplt_pd(a, b); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpnge_pd(a, b) simde_mm_cmpnge_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpnge_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI) - return _mm_cmpnge_sd(a, b); - #else - return simde_mm_cmplt_sd(a, b); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpnge_sd(a, b) simde_mm_cmpnge_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpnlt_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpnlt_pd(a, b); - #else - return simde_mm_cmpge_pd(a, b); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpnlt_pd(a, b) simde_mm_cmpnlt_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpnlt_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpnlt_sd(a, b); - #else - return simde_mm_cmpge_sd(a, b); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpnlt_sd(a, b) simde_mm_cmpnlt_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpnle_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpnle_pd(a, b); - #else - return simde_mm_cmpgt_pd(a, b); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpnle_pd(a, b) simde_mm_cmpnle_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpnle_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return 
_mm_cmpnle_sd(a, b); - #else - return simde_mm_cmpgt_sd(a, b); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpnle_sd(a, b) simde_mm_cmpnle_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpord_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpord_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - /* Note: NEON does not have ordered compare builtin - Need to compare a eq a and b eq b to check for NaN - Do AND of results to get final */ - uint64x2_t ceqaa = vceqq_f64(a_.neon_f64, a_.neon_f64); - uint64x2_t ceqbb = vceqq_f64(b_.neon_f64, b_.neon_f64); - r_.neon_u64 = vandq_u64(ceqaa, ceqbb); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_and(wasm_f64x2_eq(a_.wasm_v128, a_.wasm_v128), - wasm_f64x2_eq(b_.wasm_v128, b_.wasm_v128)); - #elif defined(simde_math_isnan) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.u64[i] = (!simde_math_isnan(a_.f64[i]) && !simde_math_isnan(b_.f64[i])) ? ~UINT64_C(0) : UINT64_C(0); - } - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpord_pd(a, b) simde_mm_cmpord_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde_float64 -simde_mm_cvtsd_f64 (simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI) - return _mm_cvtsd_f64(a); - #else - simde__m128d_private a_ = simde__m128d_to_private(a); - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return HEDLEY_STATIC_CAST(simde_float64, vgetq_lane_f64(a_.neon_f64, 0)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return HEDLEY_STATIC_CAST(simde_float64, wasm_f64x2_extract_lane(a_.wasm_v128, 0)); - #else - return a_.f64[0]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvtsd_f64(a) simde_mm_cvtsd_f64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpord_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpord_sd(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_sd(a, simde_mm_cmpord_pd(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_sd(a, simde_mm_cmpord_pd(simde_x_mm_broadcastlow_pd(a), simde_x_mm_broadcastlow_pd(b))); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(simde_math_isnan) - r_.u64[0] = (!simde_math_isnan(a_.f64[0]) && !simde_math_isnan(b_.f64[0])) ? 
~UINT64_C(0) : UINT64_C(0); - r_.u64[1] = a_.u64[1]; - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpord_sd(a, b) simde_mm_cmpord_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpunord_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpunord_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - uint64x2_t ceqaa = vceqq_f64(a_.neon_f64, a_.neon_f64); - uint64x2_t ceqbb = vceqq_f64(b_.neon_f64, b_.neon_f64); - r_.neon_u64 = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vandq_u64(ceqaa, ceqbb)))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_or(wasm_f64x2_ne(a_.wasm_v128, a_.wasm_v128), - wasm_f64x2_ne(b_.wasm_v128, b_.wasm_v128)); - #elif defined(simde_math_isnan) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.u64[i] = (simde_math_isnan(a_.f64[i]) || simde_math_isnan(b_.f64[i])) ? ~UINT64_C(0) : UINT64_C(0); - } - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpunord_pd(a, b) simde_mm_cmpunord_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpunord_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpunord_sd(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_sd(a, simde_mm_cmpunord_pd(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_sd(a, simde_mm_cmpunord_pd(simde_x_mm_broadcastlow_pd(a), simde_x_mm_broadcastlow_pd(b))); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(simde_math_isnan) - r_.u64[0] = (simde_math_isnan(a_.f64[0]) || simde_math_isnan(b_.f64[0])) ? 
~UINT64_C(0) : UINT64_C(0); - r_.u64[1] = a_.u64[1]; - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpunord_sd(a, b) simde_mm_cmpunord_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cvtepi32_pd (simde__m128i a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cvtepi32_pd(a); - #else - simde__m128d_private r_; - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_convert_low_i32x4(a_.wasm_v128); - #elif defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.f64, a_.m64_private[0].i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = (simde_float64) a_.i32[i]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvtepi32_pd(a) simde_mm_cvtepi32_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cvtepi32_ps (simde__m128i a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cvtepi32_ps(a); - #else - simde__m128_private r_; - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vcvtq_f32_s32(a_.neon_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_convert_i32x4(a_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - HEDLEY_DIAGNOSTIC_PUSH - #if HEDLEY_HAS_WARNING("-Wc11-extensions") - #pragma clang diagnostic ignored "-Wc11-extensions" - #endif - r_.altivec_f32 = vec_ctf(a_.altivec_i32, 0); - HEDLEY_DIAGNOSTIC_POP - #elif defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.f32, a_.i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = (simde_float32) a_.i32[i]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvtepi32_ps(a) simde_mm_cvtepi32_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_cvtpd_pi32 (simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_cvtpd_pi32(a); - #else - simde__m64_private r_; - simde__m128d_private a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - simde_float64 v = simde_math_round(a_.f64[i]); - #if defined(SIMDE_FAST_CONVERSION_RANGE) - r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v); - #else - r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX))) ? 
- SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN; - #endif - } - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvtpd_pi32(a) simde_mm_cvtpd_pi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtpd_epi32 (simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(SIMDE_BUG_PGI_30107) - return _mm_cvtpd_epi32(a); - #else - simde__m128i_private r_; - - r_.m64[0] = simde_mm_cvtpd_pi32(a); - r_.m64[1] = simde_mm_setzero_si64(); - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvtpd_epi32(a) simde_mm_cvtpd_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cvtpd_ps (simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cvtpd_ps(a); - #else - simde__m128_private r_; - simde__m128d_private a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f32 = vcombine_f32(vcvt_f32_f64(a_.neon_f64), vdup_n_f32(0.0f)); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - r_.altivec_f32 = vec_float2(a_.altivec_f64, vec_splats(0.0)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_demote_f64x2_zero(a_.wasm_v128); - #elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && HEDLEY_HAS_BUILTIN(__builtin_convertvector) - float __attribute__((__vector_size__(8))) z = { 0.0f, 0.0f }; - r_.f32 = - __builtin_shufflevector( - __builtin_convertvector(__builtin_shufflevector(a_.f64, a_.f64, 0, 1), __typeof__(z)), z, - 0, 1, 2, 3 - ); - #else - r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, a_.f64[0]); - r_.f32[1] = HEDLEY_STATIC_CAST(simde_float32, a_.f64[1]); - r_.f32[2] = SIMDE_FLOAT32_C(0.0); - r_.f32[3] = SIMDE_FLOAT32_C(0.0); - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvtpd_ps(a) simde_mm_cvtpd_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cvtpi32_pd (simde__m64 a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_cvtpi32_pd(a); - #else - simde__m128d_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - - #if defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.f64, a_.i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = (simde_float64) a_.i32[i]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvtpi32_pd(a) simde_mm_cvtpi32_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtps_epi32 (simde__m128 a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cvtps_epi32(a); - #else - simde__m128i_private r_; - simde__m128_private a_; - - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FAST_ROUND_TIES) && !defined(SIMDE_BUG_GCC_95399) - a_ = simde__m128_to_private(a); - r_.neon_i32 = vcvtnq_s32_f32(a_.neon_f32); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FAST_ROUND_TIES) - a_ = simde__m128_to_private(a); - HEDLEY_DIAGNOSTIC_PUSH - SIMDE_DIAGNOSTIC_DISABLE_C11_EXTENSIONS_ - SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_ - r_.altivec_i32 = vec_cts(a_.altivec_f32, 1); - HEDLEY_DIAGNOSTIC_POP - #elif defined(SIMDE_WASM_SIMD128_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FAST_ROUND_TIES) - a_ = simde__m128_to_private(a); - r_.wasm_v128 = 
wasm_i32x4_trunc_sat_f32x4(a_.wasm_v128); - #else - a_ = simde__m128_to_private(simde_x_mm_round_ps(a, SIMDE_MM_FROUND_TO_NEAREST_INT, 1)); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - simde_float32 v = simde_math_roundf(a_.f32[i]); - #if defined(SIMDE_FAST_CONVERSION_RANGE) - r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v); - #else - r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) ? - SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN; - #endif - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvtps_epi32(a) simde_mm_cvtps_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cvtps_pd (simde__m128 a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cvtps_pd(a); - #else - simde__m128d_private r_; - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_promote_low_f32x4(a_.wasm_v128); - #elif defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.f64, a_.m64_private[0].f32); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vcvt_f64_f32(vget_low_f32(a_.neon_f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f32[i]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvtps_pd(a) simde_mm_cvtps_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int32_t -simde_mm_cvtsd_si32 (simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cvtsd_si32(a); - #else - simde__m128d_private a_ = simde__m128d_to_private(a); - - simde_float64 v = simde_math_round(a_.f64[0]); - #if defined(SIMDE_FAST_CONVERSION_RANGE) - return SIMDE_CONVERT_FTOI(int32_t, v); - #else - return ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX))) ? 
- SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN; - #endif - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvtsd_si32(a) simde_mm_cvtsd_si32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int64_t -simde_mm_cvtsd_si64 (simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) - #if defined(__PGI) - return _mm_cvtsd_si64x(a); - #else - return _mm_cvtsd_si64(a); - #endif - #else - simde__m128d_private a_ = simde__m128d_to_private(a); - return SIMDE_CONVERT_FTOI(int64_t, simde_math_round(a_.f64[0])); - #endif -} -#define simde_mm_cvtsd_si64x(a) simde_mm_cvtsd_si64(a) -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) - #define _mm_cvtsd_si64(a) simde_mm_cvtsd_si64(a) - #define _mm_cvtsd_si64x(a) simde_mm_cvtsd_si64x(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cvtsd_ss (simde__m128 a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cvtsd_ss(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - simde__m128d_private b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f32 = vsetq_lane_f32(vcvtxd_f32_f64(vgetq_lane_f64(b_.neon_f64, 0)), a_.neon_f32, 0); - #else - r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b_.f64[0]); - - SIMDE_VECTORIZE - for (size_t i = 1 ; i < (sizeof(r_) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i]; - } - #endif - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvtsd_ss(a, b) simde_mm_cvtsd_ss(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int16_t -simde_x_mm_cvtsi128_si16 (simde__m128i a) { - simde__m128i_private - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - return vgetq_lane_s16(a_.neon_i16, 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return HEDLEY_STATIC_CAST(int16_t, wasm_i16x8_extract_lane(a_.wasm_v128, 0)); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - #if defined(SIMDE_BUG_GCC_95227) - (void) a_; - #endif - return vec_extract(a_.altivec_i16, 0); - #else - return a_.i16[0]; - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -int32_t -simde_mm_cvtsi128_si32 (simde__m128i a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cvtsi128_si32(a); - #else - simde__m128i_private - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - return vgetq_lane_s32(a_.neon_i32, 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return HEDLEY_STATIC_CAST(int32_t, wasm_i32x4_extract_lane(a_.wasm_v128, 0)); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - #if defined(SIMDE_BUG_GCC_95227) - (void) a_; - #endif - return vec_extract(a_.altivec_i32, 0); - #else - return a_.i32[0]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvtsi128_si32(a) simde_mm_cvtsi128_si32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int64_t -simde_mm_cvtsi128_si64 (simde__m128i a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) - #if defined(__PGI) - return _mm_cvtsi128_si64x(a); - #else - return _mm_cvtsi128_si64(a); - #endif - #else - simde__m128i_private a_ = simde__m128i_to_private(a); - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && !defined(HEDLEY_IBM_VERSION) - return vec_extract(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed long long), a_.i64), 0); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - return vgetq_lane_s64(a_.neon_i64, 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return 
HEDLEY_STATIC_CAST(int64_t, wasm_i64x2_extract_lane(a_.wasm_v128, 0)); - #endif - return a_.i64[0]; - #endif -} -#define simde_mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64(a) -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) - #define _mm_cvtsi128_si64(a) simde_mm_cvtsi128_si64(a) - #define _mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64x(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cvtsi32_sd (simde__m128d a, int32_t b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cvtsi32_sd(a, b); - #else - simde__m128d_private r_; - simde__m128d_private a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vsetq_lane_f64(HEDLEY_STATIC_CAST(float64_t, b), a_.neon_f64, 0); - #else - r_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b); - r_.i64[1] = a_.i64[1]; - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvtsi32_sd(a, b) simde_mm_cvtsi32_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_cvtsi16_si128 (int16_t a) { - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vsetq_lane_s16(a, vdupq_n_s16(0), 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_make(a, 0, 0, 0, 0, 0, 0, 0); - #else - r_.i16[0] = a; - r_.i16[1] = 0; - r_.i16[2] = 0; - r_.i16[3] = 0; - r_.i16[4] = 0; - r_.i16[5] = 0; - r_.i16[6] = 0; - r_.i16[7] = 0; - #endif - - return simde__m128i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtsi32_si128 (int32_t a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cvtsi32_si128(a); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vsetq_lane_s32(a, vdupq_n_s32(0), 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_make(a, 0, 0, 0); - #else - r_.i32[0] = a; - r_.i32[1] = 0; - r_.i32[2] = 0; - r_.i32[3] = 0; - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvtsi32_si128(a) simde_mm_cvtsi32_si128(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cvtsi64_sd (simde__m128d a, int64_t b) { - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) - #if !defined(__PGI) - return _mm_cvtsi64_sd(a, b); - #else - return _mm_cvtsi64x_sd(a, b); - #endif - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vsetq_lane_f64(HEDLEY_STATIC_CAST(float64_t, b), a_.neon_f64, 0); - #else - r_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b); - r_.f64[1] = a_.f64[1]; - #endif - - return simde__m128d_from_private(r_); - #endif -} -#define simde_mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64_sd(a, b) -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) - #define _mm_cvtsi64_sd(a, b) simde_mm_cvtsi64_sd(a, b) - #define _mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64x_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtsi64_si128 (int64_t a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) - #if !defined(__PGI) - return _mm_cvtsi64_si128(a); - #else - return _mm_cvtsi64x_si128(a); - #endif - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vsetq_lane_s64(a, vdupq_n_s64(0), 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - 
r_.wasm_v128 = wasm_i64x2_make(a, 0); - #else - r_.i64[0] = a; - r_.i64[1] = 0; - #endif - - return simde__m128i_from_private(r_); - #endif -} -#define simde_mm_cvtsi64x_si128(a) simde_mm_cvtsi64_si128(a) -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) - #define _mm_cvtsi64_si128(a) simde_mm_cvtsi64_si128(a) - #define _mm_cvtsi64x_si128(a) simde_mm_cvtsi64x_si128(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cvtss_sd (simde__m128d a, simde__m128 b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cvtss_sd(a, b); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - float64x2_t temp = vcvt_f64_f32(vset_lane_f32(vgetq_lane_f32(simde__m128_to_private(b).neon_f32, 0), vdup_n_f32(0), 0)); - return vsetq_lane_f64(vgetq_lane_f64(simde__m128d_to_private(a).neon_f64, 1), temp, 1); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a); - simde__m128_private b_ = simde__m128_to_private(b); - - a_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b_.f32[0]); - - return simde__m128d_from_private(a_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvtss_sd(a, b) simde_mm_cvtss_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_cvttpd_pi32 (simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_cvttpd_pi32(a); - #else - simde__m64_private r_; - simde__m128d_private a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) - SIMDE_CONVERT_VECTOR_(r_.i32, a_.f64); - #else - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - simde_float64 v = a_.f64[i]; - #if defined(SIMDE_FAST_CONVERSION_RANGE) - r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v); - #else - r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX))) ? - SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN; - #endif - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvttpd_pi32(a) simde_mm_cvttpd_pi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvttpd_epi32 (simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cvttpd_epi32(a); - #else - simde__m128i_private r_; - - r_.m64[0] = simde_mm_cvttpd_pi32(a); - r_.m64[1] = simde_mm_setzero_si64(); - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvttpd_epi32(a) simde_mm_cvttpd_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvttps_epi32 (simde__m128 a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cvttps_epi32(a); - #else - simde__m128i_private r_; - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vcvtq_s32_f32(a_.neon_f32); - - #if !defined(SIMDE_FAST_CONVERSION_RANGE) || !defined(SIMDE_FAST_NANS) - /* Values below INT32_MIN saturate anyways, so we don't need to - * test for that. 
*/ - #if !defined(SIMDE_FAST_CONVERSION_RANGE) && !defined(SIMDE_FAST_NANS) - uint32x4_t valid_input = - vandq_u32( - vcltq_f32(a_.neon_f32, vdupq_n_f32(SIMDE_FLOAT32_C(2147483648.0))), - vceqq_f32(a_.neon_f32, a_.neon_f32) - ); - #elif !defined(SIMDE_FAST_CONVERSION_RANGE) - uint32x4_t valid_input = vcltq_f32(a_.neon_f32, vdupq_n_f32(SIMDE_FLOAT32_C(2147483648.0))); - #elif !defined(SIMDE_FAST_NANS) - uint32x4_t valid_input = vceqq_f32(a_.neon_f32, a_.neon_f32); - #endif - - r_.neon_i32 = vbslq_s32(valid_input, r_.neon_i32, vdupq_n_s32(INT32_MIN)); - #endif - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_trunc_sat_f32x4(a_.wasm_v128); - - #if !defined(SIMDE_FAST_CONVERSION_RANGE) || !defined(SIMDE_FAST_NANS) - #if !defined(SIMDE_FAST_CONVERSION_RANGE) && !defined(SIMDE_FAST_NANS) - v128_t valid_input = - wasm_v128_and( - wasm_f32x4_lt(a_.wasm_v128, wasm_f32x4_splat(SIMDE_FLOAT32_C(2147483648.0))), - wasm_f32x4_eq(a_.wasm_v128, a_.wasm_v128) - ); - #elif !defined(SIMDE_FAST_CONVERSION_RANGE) - v128_t valid_input = wasm_f32x4_lt(a_.wasm_v128, wasm_f32x4_splat(SIMDE_FLOAT32_C(2147483648.0))); - #elif !defined(SIMDE_FAST_NANS) - v128_t valid_input = wasm_f32x4_eq(a_.wasm_v128, a_.wasm_v128); - #endif - - r_.wasm_v128 = wasm_v128_bitselect(r_.wasm_v128, wasm_i32x4_splat(INT32_MIN), valid_input); - #endif - #elif defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_ARCH_POWER) - SIMDE_CONVERT_VECTOR_(r_.i32, a_.f32); - - #if !defined(SIMDE_FAST_CONVERSION_RANGE) || !defined(SIMDE_FAST_NANS) - #if !defined(SIMDE_FAST_CONVERSION_RANGE) - static const simde_float32 SIMDE_VECTOR(16) first_too_high = { SIMDE_FLOAT32_C(2147483648.0), SIMDE_FLOAT32_C(2147483648.0), SIMDE_FLOAT32_C(2147483648.0), SIMDE_FLOAT32_C(2147483648.0) }; - - __typeof__(r_.i32) valid_input = - HEDLEY_REINTERPRET_CAST( - __typeof__(r_.i32), - (a_.f32 < first_too_high) & (a_.f32 >= -first_too_high) - ); - #elif !defined(SIMDE_FAST_NANS) - __typeof__(r_.i32) valid_input = HEDLEY_REINTERPRET_CAST( __typeof__(valid_input), a_.f32 == a_.f32); - #endif - - __typeof__(r_.i32) invalid_output = { INT32_MIN, INT32_MIN, INT32_MIN, INT32_MIN }; - r_.i32 = (r_.i32 & valid_input) | (invalid_output & ~valid_input); - #endif - #else - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - simde_float32 v = a_.f32[i]; - #if defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FAST_NANS) - r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v); - #else - r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) ? - SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN; - #endif - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvttps_epi32(a) simde_mm_cvttps_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int32_t -simde_mm_cvttsd_si32 (simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cvttsd_si32(a); - #else - simde__m128d_private a_ = simde__m128d_to_private(a); - simde_float64 v = a_.f64[0]; - #if defined(SIMDE_FAST_CONVERSION_RANGE) - return SIMDE_CONVERT_FTOI(int32_t, v); - #else - return ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX))) ? 
- SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN; - #endif - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvttsd_si32(a) simde_mm_cvttsd_si32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int64_t -simde_mm_cvttsd_si64 (simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) - #if !defined(__PGI) - return _mm_cvttsd_si64(a); - #else - return _mm_cvttsd_si64x(a); - #endif - #else - simde__m128d_private a_ = simde__m128d_to_private(a); - return SIMDE_CONVERT_FTOI(int64_t, a_.f64[0]); - #endif -} -#define simde_mm_cvttsd_si64x(a) simde_mm_cvttsd_si64(a) -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) - #define _mm_cvttsd_si64(a) simde_mm_cvttsd_si64(a) - #define _mm_cvttsd_si64x(a) simde_mm_cvttsd_si64x(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_div_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_div_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f64 = a_.f64 / b_.f64; - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vdivq_f64(a_.neon_f64, b_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_div(a_.wasm_v128, b_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f64[i] / b_.f64[i]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_div_pd(a, b) simde_mm_div_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_div_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_div_sd(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_sd(a, simde_mm_div_pd(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_sd(a, simde_mm_div_pd(simde_x_mm_broadcastlow_pd(a), simde_x_mm_broadcastlow_pd(b))); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - float64x2_t temp = vdivq_f64(a_.neon_f64, b_.neon_f64); - r_.neon_f64 = vsetq_lane_f64(vgetq_lane(a_.neon_f64, 1), temp, 1); - #else - r_.f64[0] = a_.f64[0] / b_.f64[0]; - r_.f64[1] = a_.f64[1]; - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_div_sd(a, b) simde_mm_div_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int32_t -simde_mm_extract_epi16 (simde__m128i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { - uint16_t r; - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - #if defined(SIMDE_BUG_GCC_95227) - (void) a_; - (void) imm8; - #endif - r = HEDLEY_STATIC_CAST(uint16_t, vec_extract(a_.altivec_i16, imm8)); - #else - r = a_.u16[imm8 & 7]; - #endif - - return HEDLEY_STATIC_CAST(int32_t, r); -} -#if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(4,6,0)) - #define simde_mm_extract_epi16(a, imm8) _mm_extract_epi16(a, imm8) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #define simde_mm_extract_epi16(a, imm8) (HEDLEY_STATIC_CAST(int32_t, vgetq_lane_s16(simde__m128i_to_private(a).neon_i16, (imm8))) & (INT32_C(0x0000ffff))) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - 
#define simde_mm_extract_epi16(a, imm8) HEDLEY_STATIC_CAST(int32_t, wasm_u16x8_extract_lane(simde__m128i_to_wasm_v128((a)), (imm8) & 7)) -#endif -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_extract_epi16(a, imm8) simde_mm_extract_epi16(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_insert_epi16 (simde__m128i a, int16_t i, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { - simde__m128i_private a_ = simde__m128i_to_private(a); - a_.i16[imm8 & 7] = i; - return simde__m128i_from_private(a_); -} -#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI) - #define simde_mm_insert_epi16(a, i, imm8) _mm_insert_epi16((a), (i), (imm8)) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #define simde_mm_insert_epi16(a, i, imm8) simde__m128i_from_neon_i16(vsetq_lane_s16((i), simde__m128i_to_neon_i16(a), (imm8))) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - #define simde_mm_insert_epi16(a, i, imm8) wasm_i16x8_replace_lane(simde__m128i_to_wasm_v128((a)), (imm8) & 7, (i)) -#endif -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_insert_epi16(a, i, imm8) simde_mm_insert_epi16(a, i, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_load_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_load_pd(mem_addr); - #else - simde__m128d_private r_; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vld1q_f64(mem_addr); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vld1q_u32(HEDLEY_REINTERPRET_CAST(uint32_t const*, mem_addr)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_load(mem_addr); - #else - simde_memcpy(&r_, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128d), sizeof(r_)); - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_load_pd(mem_addr) simde_mm_load_pd(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_load1_pd (simde_float64 const* mem_addr) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_load1_pd(mem_addr); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return simde__m128d_from_neon_f64(vld1q_dup_f64(mem_addr)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return simde__m128d_from_wasm_v128(wasm_v128_load64_splat(mem_addr)); - #else - return simde_mm_set1_pd(*mem_addr); - #endif -} -#define simde_mm_load_pd1(mem_addr) simde_mm_load1_pd(mem_addr) -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_load_pd1(mem_addr) simde_mm_load1_pd(mem_addr) - #define _mm_load1_pd(mem_addr) simde_mm_load1_pd(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_load_sd (simde_float64 const* mem_addr) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_load_sd(mem_addr); - #else - simde__m128d_private r_; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vsetq_lane_f64(*mem_addr, vdupq_n_f64(0), 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_load64_zero(HEDLEY_REINTERPRET_CAST(const void*, mem_addr)); - #else - r_.f64[0] = *mem_addr; - r_.u64[1] = UINT64_C(0); - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_load_sd(mem_addr) simde_mm_load_sd(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_load_si128 (simde__m128i const* mem_addr) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_load_si128(HEDLEY_REINTERPRET_CAST(__m128i const*, mem_addr)); - #elif 
defined(SIMDE_ARM_NEON_A32V7_NATIVE) - return vld1q_s64(HEDLEY_REINTERPRET_CAST(int64_t const*, mem_addr)); - #else - simde__m128i_private r_; - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i32 = vec_ld(0, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(int) const*, mem_addr)); - #else - simde_memcpy(&r_, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128i), sizeof(simde__m128i)); - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_load_si128(mem_addr) simde_mm_load_si128(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_loadh_pd (simde__m128d a, simde_float64 const* mem_addr) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_loadh_pd(a, mem_addr); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vcombine_f64(vget_low_f64(a_.neon_f64), vld1_f64(HEDLEY_REINTERPRET_CAST(const float64_t*, mem_addr))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_load64_lane(HEDLEY_REINTERPRET_CAST(const void*, mem_addr), a_.wasm_v128, 1); - #else - simde_float64 t; - - simde_memcpy(&t, mem_addr, sizeof(t)); - r_.f64[0] = a_.f64[0]; - r_.f64[1] = t; - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_loadh_pd(a, mem_addr) simde_mm_loadh_pd(a, mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_loadl_epi64 (simde__m128i const* mem_addr) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_loadl_epi64(mem_addr); - #else - simde__m128i_private r_; - - int64_t value; - simde_memcpy(&value, mem_addr, sizeof(value)); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vcombine_s64(vld1_s64(HEDLEY_REINTERPRET_CAST(int64_t const *, mem_addr)), vdup_n_s64(0)); - #else - r_.i64[0] = value; - r_.i64[1] = 0; - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_loadl_epi64(mem_addr) simde_mm_loadl_epi64(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_loadl_pd (simde__m128d a, simde_float64 const* mem_addr) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_loadl_pd(a, mem_addr); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vcombine_f64(vld1_f64( - HEDLEY_REINTERPRET_CAST(const float64_t*, mem_addr)), vget_high_f64(a_.neon_f64)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_load64_lane(HEDLEY_REINTERPRET_CAST(const void*, mem_addr), a_.wasm_v128, 0); - #else - r_.f64[0] = *mem_addr; - r_.u64[1] = a_.u64[1]; - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_loadl_pd(a, mem_addr) simde_mm_loadl_pd(a, mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_loadr_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_loadr_pd(mem_addr); - #else - simde__m128d_private - r_; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vld1q_f64(mem_addr); - r_.neon_f64 = vextq_f64(r_.neon_f64, r_.neon_f64, 1); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vld1q_s64(HEDLEY_REINTERPRET_CAST(int64_t const *, mem_addr)); - r_.neon_i64 = vextq_s64(r_.neon_i64, r_.neon_i64, 1); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t tmp = 
wasm_v128_load(mem_addr); - r_.wasm_v128 = wasm_i64x2_shuffle(tmp, tmp, 1, 0); - #else - r_.f64[0] = mem_addr[1]; - r_.f64[1] = mem_addr[0]; - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_loadr_pd(mem_addr) simde_mm_loadr_pd(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_loadu_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_loadu_pd(mem_addr); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vld1q_f64(mem_addr); - #else - simde__m128d_private r_; - - simde_memcpy(&r_, mem_addr, sizeof(r_)); - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_loadu_pd(mem_addr) simde_mm_loadu_pd(mem_addr) -#endif - -#if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) \ - && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862) \ - && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) - #define simde_mm_loadu_epi8(mem_addr) _mm_loadu_epi8(mem_addr) -#else -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_loadu_epi8(void const * mem_addr) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, mem_addr)); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr)); - #else - simde_memcpy(&r_, mem_addr, sizeof(r_)); - #endif - - return simde__m128i_from_private(r_); - #endif -} -#endif -#define simde_x_mm_loadu_epi8(mem_addr) simde_mm_loadu_epi8(mem_addr) -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862))) - #undef _mm_loadu_epi8 - #define _mm_loadu_epi8(a) simde_mm_loadu_epi8(a) -#endif - -#if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) \ - && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862) \ - && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) - #define simde_mm_loadu_epi16(mem_addr) _mm_loadu_epi16(mem_addr) -#else -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_loadu_epi16(void const * mem_addr) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, mem_addr)); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vreinterpretq_s16_s8(vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr))); - #else - simde_memcpy(&r_, mem_addr, sizeof(r_)); - #endif - - return simde__m128i_from_private(r_); - #endif -} -#endif -#define simde_x_mm_loadu_epi16(mem_addr) simde_mm_loadu_epi16(mem_addr) -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862))) - #undef _mm_loadu_epi16 - #define _mm_loadu_epi16(a) simde_mm_loadu_epi16(a) -#endif - -#if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) \ - && !defined(SIMDE_BUG_CLANG_REV_344862) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) - #define simde_mm_loadu_epi32(mem_addr) _mm_loadu_epi32(mem_addr) -#else -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_loadu_epi32(void const * mem_addr) { - #if 
defined(SIMDE_X86_SSE2_NATIVE) - return _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, mem_addr)); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vreinterpretq_s32_s8(vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr))); - #else - simde_memcpy(&r_, mem_addr, sizeof(r_)); - #endif - - return simde__m128i_from_private(r_); - #endif -} -#endif -#define simde_x_mm_loadu_epi32(mem_addr) simde_mm_loadu_epi32(mem_addr) -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862))) - #undef _mm_loadu_epi32 - #define _mm_loadu_epi32(a) simde_mm_loadu_epi32(a) -#endif - -#if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) \ - && !defined(SIMDE_BUG_CLANG_REV_344862) \ - && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) - #define simde_mm_loadu_epi64(mem_addr) _mm_loadu_epi64(mem_addr) -#else -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_loadu_epi64(void const * mem_addr) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, mem_addr)); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vreinterpretq_s64_s8(vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr))); - #else - simde_memcpy(&r_, mem_addr, sizeof(r_)); - #endif - - return simde__m128i_from_private(r_); - #endif -} -#endif -#define simde_x_mm_loadu_epi64(mem_addr) simde_mm_loadu_epi64(mem_addr) -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862))) - #undef _mm_loadu_epi64 - #define _mm_loadu_epi64(a) simde_mm_loadu_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_loadu_si128 (void const* mem_addr) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_loadu_si128(HEDLEY_STATIC_CAST(__m128i const*, mem_addr)); - #else - simde__m128i_private r_; - - #if HEDLEY_GNUC_HAS_ATTRIBUTE(may_alias,3,3,0) - HEDLEY_DIAGNOSTIC_PUSH - SIMDE_DIAGNOSTIC_DISABLE_PACKED_ - struct simde_mm_loadu_si128_s { - __typeof__(r_) v; - } __attribute__((__packed__, __may_alias__)); - r_ = HEDLEY_REINTERPRET_CAST(const struct simde_mm_loadu_si128_s *, mem_addr)->v; - HEDLEY_DIAGNOSTIC_POP - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr)); - #else - simde_memcpy(&r_, mem_addr, sizeof(r_)); - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_loadu_si128(mem_addr) simde_mm_loadu_si128(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_madd_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_madd_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - int32x4_t pl = vmull_s16(vget_low_s16(a_.neon_i16), vget_low_s16(b_.neon_i16)); - int32x4_t ph = vmull_high_s16(a_.neon_i16, b_.neon_i16); - r_.neon_i32 = vpaddq_s32(pl, ph); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int32x4_t pl = vmull_s16(vget_low_s16(a_.neon_i16), vget_low_s16(b_.neon_i16)); - int32x4_t ph = vmull_s16(vget_high_s16(a_.neon_i16), vget_high_s16(b_.neon_i16)); - int32x2_t rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl)); - int32x2_t rh = 
vpadd_s32(vget_low_s32(ph), vget_high_s32(ph)); - r_.neon_i32 = vcombine_s32(rl, rh); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i32 = vec_msum(a_.altivec_i16, b_.altivec_i16, vec_splats(0)); - #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i32 = vec_mule(a_.altivec_i16, b_.altivec_i16) + vec_mulo(a_.altivec_i16, b_.altivec_i16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_dot_i16x8(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) - int32_t SIMDE_VECTOR(32) a32, b32, p32; - SIMDE_CONVERT_VECTOR_(a32, a_.i16); - SIMDE_CONVERT_VECTOR_(b32, b_.i16); - p32 = a32 * b32; - r_.i32 = - __builtin_shufflevector(p32, p32, 0, 2, 4, 6) + - __builtin_shufflevector(p32, p32, 1, 3, 5, 7); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i += 2) { - r_.i32[i / 2] = (a_.i16[i] * b_.i16[i]) + (a_.i16[i + 1] * b_.i16[i + 1]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_madd_epi16(a, b) simde_mm_madd_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_maskmoveu_si128 (simde__m128i a, simde__m128i mask, int8_t mem_addr[HEDLEY_ARRAY_PARAM(16)]) { - #if defined(SIMDE_X86_SSE2_NATIVE) - _mm_maskmoveu_si128(a, mask, HEDLEY_REINTERPRET_CAST(char*, mem_addr)); - #else - simde__m128i_private - a_ = simde__m128i_to_private(a), - mask_ = simde__m128i_to_private(mask); - - for (size_t i = 0 ; i < (sizeof(a_.i8) / sizeof(a_.i8[0])) ; i++) { - if (mask_.u8[i] & 0x80) { - mem_addr[i] = a_.i8[i]; - } - } - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_maskmoveu_si128(a, mask, mem_addr) simde_mm_maskmoveu_si128((a), (mask), SIMDE_CHECKED_REINTERPRET_CAST(int8_t*, char*, (mem_addr))) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int32_t -simde_mm_movemask_epi8 (simde__m128i a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__INTEL_COMPILER) - /* ICC has trouble with _mm_movemask_epi8 at -O2 and above: */ - return _mm_movemask_epi8(a); - #else - int32_t r = 0; - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - /* https://github.com/WebAssembly/simd/pull/201#issue-380682845 */ - static const uint8_t md[16] = { - 1 << 0, 1 << 1, 1 << 2, 1 << 3, - 1 << 4, 1 << 5, 1 << 6, 1 << 7, - 1 << 0, 1 << 1, 1 << 2, 1 << 3, - 1 << 4, 1 << 5, 1 << 6, 1 << 7, - }; - - /* Extend sign bit over entire lane */ - uint8x16_t extended = vreinterpretq_u8_s8(vshrq_n_s8(a_.neon_i8, 7)); - /* Clear all but the bit we're interested in. 
*/ - uint8x16_t masked = vandq_u8(vld1q_u8(md), extended); - /* Alternate bytes from low half and high half */ - uint8x8x2_t tmp = vzip_u8(vget_low_u8(masked), vget_high_u8(masked)); - uint16x8_t x = vreinterpretq_u16_u8(vcombine_u8(tmp.val[0], tmp.val[1])); - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r = vaddvq_u16(x); - #else - uint64x2_t t64 = vpaddlq_u32(vpaddlq_u16(x)); - r = - HEDLEY_STATIC_CAST(int32_t, vgetq_lane_u64(t64, 0)) + - HEDLEY_STATIC_CAST(int32_t, vgetq_lane_u64(t64, 1)); - #endif - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(HEDLEY_IBM_VERSION) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) - static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) perm = { 120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0 }; - r = HEDLEY_STATIC_CAST(int32_t, vec_extract(vec_vbpermq(a_.altivec_u8, perm), 1)); - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(HEDLEY_IBM_VERSION) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG) - static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) perm = { 120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0 }; - r = HEDLEY_STATIC_CAST(int32_t, vec_extract(vec_vbpermq(a_.altivec_u8, perm), 14)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r = HEDLEY_STATIC_CAST(int32_t, wasm_i8x16_bitmask(a_.wasm_v128)); - #else - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) { - r |= (a_.u8[15 - i] >> 7) << (15 - i); - } - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_movemask_epi8(a) simde_mm_movemask_epi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int32_t -simde_mm_movemask_pd (simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_movemask_pd(a); - #else - int32_t r = 0; - simde__m128d_private a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - HEDLEY_DIAGNOSTIC_PUSH - SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_ - uint64x2_t shifted = vshrq_n_u64(a_.neon_u64, 63); - r = - HEDLEY_STATIC_CAST(int32_t, vgetq_lane_u64(shifted, 0)) + - (HEDLEY_STATIC_CAST(int32_t, vgetq_lane_u64(shifted, 1)) << 1); - HEDLEY_DIAGNOSTIC_POP - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && defined(SIMDE_BUG_CLANG_50932) - SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) idx = { 64, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; - SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) res = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), vec_bperm(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned __int128), a_.altivec_u64), idx)); - r = HEDLEY_STATIC_CAST(int32_t, vec_extract(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), res), 2)); - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) idx = { 64, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; - SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) res = vec_bperm(a_.altivec_u8, idx); - r = HEDLEY_STATIC_CAST(int32_t, vec_extract(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), res), 2)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r = HEDLEY_STATIC_CAST(int32_t, wasm_i64x2_bitmask(a_.wasm_v128)); - #else - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { - r |= (a_.u64[i] >> 63) << i; - } - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_movemask_pd(a) simde_mm_movemask_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 
-simde_mm_movepi64_pi64 (simde__m128i a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_movepi64_pi64(a); - #else - simde__m64_private r_; - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i64 = vget_low_s64(a_.neon_i64); - #else - r_.i64[0] = a_.i64[0]; - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_movepi64_pi64(a) simde_mm_movepi64_pi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_movpi64_epi64 (simde__m64 a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_movpi64_epi64(a); - #else - simde__m128i_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vcombine_s64(a_.neon_i64, vdup_n_s64(0)); - #else - r_.i64[0] = a_.i64[0]; - r_.i64[1] = 0; - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_movpi64_epi64(a) simde_mm_movpi64_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_min_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_min_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vminq_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_min(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i16 = vec_min(a_.altivec_i16, b_.altivec_i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? a_.i16[i] : b_.i16[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_min_epi16(a, b) simde_mm_min_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_min_epu8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_min_epu8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vminq_u8(a_.neon_u8, b_.neon_u8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u8x16_min(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_u8 = vec_min(a_.altivec_u8, b_.altivec_u8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = (a_.u8[i] < b_.u8[i]) ? 
a_.u8[i] : b_.u8[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_min_epu8(a, b) simde_mm_min_epu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_min_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_min_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = vec_min(a_.altivec_f64, b_.altivec_f64); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vminq_f64(a_.neon_f64, b_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_min(a_.wasm_v128, b_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = (a_.f64[i] < b_.f64[i]) ? a_.f64[i] : b_.f64[i]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_min_pd(a, b) simde_mm_min_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_min_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_min_sd(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_sd(a, simde_mm_min_pd(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_sd(a, simde_mm_min_pd(simde_x_mm_broadcastlow_pd(a), simde_x_mm_broadcastlow_pd(b))); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - float64x2_t temp = vminq_f64(a_.neon_f64, b_.neon_f64); - r_.neon_f64 = vsetq_lane_f64(vgetq_lane(a_.neon_f64, 1), temp, 1); - #else - r_.f64[0] = (a_.f64[0] < b_.f64[0]) ? a_.f64[0] : b_.f64[0]; - r_.f64[1] = a_.f64[1]; - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_min_sd(a, b) simde_mm_min_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_max_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_max_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vmaxq_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_max(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i16 = vec_max(a_.altivec_i16, b_.altivec_i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? 
a_.i16[i] : b_.i16[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_max_epi16(a, b) simde_mm_max_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_max_epu8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_max_epu8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vmaxq_u8(a_.neon_u8, b_.neon_u8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u8x16_max(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_u8 = vec_max(a_.altivec_u8, b_.altivec_u8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = (a_.u8[i] > b_.u8[i]) ? a_.u8[i] : b_.u8[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_max_epu8(a, b) simde_mm_max_epu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_max_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_max_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = vec_max(a_.altivec_f64, b_.altivec_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_max(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vmaxq_f64(a_.neon_f64, b_.neon_f64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = (a_.f64[i] > b_.f64[i]) ? a_.f64[i] : b_.f64[i]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_max_pd(a, b) simde_mm_max_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_max_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_max_sd(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_sd(a, simde_mm_max_pd(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_sd(a, simde_mm_max_pd(simde_x_mm_broadcastlow_pd(a), simde_x_mm_broadcastlow_pd(b))); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - float64x2_t temp = vmaxq_f64(a_.neon_f64, b_.neon_f64); - r_.neon_f64 = vsetq_lane_f64(vgetq_lane(a_.neon_f64, 1), temp, 1); - #else - r_.f64[0] = (a_.f64[0] > b_.f64[0]) ? 
a_.f64[0] : b_.f64[0]; - r_.f64[1] = a_.f64[1]; - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_max_sd(a, b) simde_mm_max_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_move_epi64 (simde__m128i a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_move_epi64(a); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vsetq_lane_s64(0, a_.neon_i64, 1); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, wasm_i64x2_const(0, 0), 0, 2); - #else - r_.i64[0] = a_.i64[0]; - r_.i64[1] = 0; - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_move_epi64(a) simde_mm_move_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mul_epu32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_mul_epu32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint32x2_t a_lo = vmovn_u64(a_.neon_u64); - uint32x2_t b_lo = vmovn_u64(b_.neon_u64); - r_.neon_u64 = vmull_u32(a_lo, b_lo); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u64x2_extmul_low_u32x4( - wasm_i32x4_shuffle(a_.wasm_v128, a_.wasm_v128, 0, 2, 0, 2), - wasm_i32x4_shuffle(b_.wasm_v128, b_.wasm_v128, 0, 2, 0, 2)); - #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) - __typeof__(a_.u32) z = { 0, }; - a_.u32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.u32, z, 0, 4, 2, 6); - b_.u32 = SIMDE_SHUFFLE_VECTOR_(32, 16, b_.u32, z, 0, 4, 2, 6); - r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), a_.u32) * - HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), b_.u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = HEDLEY_STATIC_CAST(uint64_t, a_.u32[i * 2]) * HEDLEY_STATIC_CAST(uint64_t, b_.u32[i * 2]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_mul_epu32(a, b) simde_mm_mul_epu32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_mul_epi64 (simde__m128i a, simde__m128i b) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x2_mul(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = a_.i64 * b_.i64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[i] * b_.i64[i]; - } - #endif - - return simde__m128i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_mod_epi64 (simde__m128i a, simde__m128i b) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_PGI_30104) - r_.i64 = a_.i64 % b_.i64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[i] % b_.i64[i]; - } - #endif - - return simde__m128i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_mul_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_mul_pd(a, b); - #else - simde__m128d_private - r_, 
- a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f64 = a_.f64 * b_.f64; - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vmulq_f64(a_.neon_f64, b_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_mul(a_.wasm_v128, b_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f64[i] * b_.f64[i]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_mul_pd(a, b) simde_mm_mul_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_mul_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_mul_sd(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_sd(a, simde_mm_mul_pd(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_sd(a, simde_mm_mul_pd(simde_x_mm_broadcastlow_pd(a), simde_x_mm_broadcastlow_pd(b))); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - float64x2_t temp = vmulq_f64(a_.neon_f64, b_.neon_f64); - r_.neon_f64 = vsetq_lane_f64(vgetq_lane(a_.neon_f64, 1), temp, 1); - #else - r_.f64[0] = a_.f64[0] * b_.f64[0]; - r_.f64[1] = a_.f64[1]; - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_mul_sd(a, b) simde_mm_mul_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_mul_su32 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI) - return _mm_mul_su32(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.u64[0] = vget_lane_u64(vget_low_u64(vmull_u32(vreinterpret_u32_s64(a_.neon_i64), vreinterpret_u32_s64(b_.neon_i64))), 0); - #else - r_.u64[0] = HEDLEY_STATIC_CAST(uint64_t, a_.u32[0]) * HEDLEY_STATIC_CAST(uint64_t, b_.u32[0]); - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_mul_su32(a, b) simde_mm_mul_su32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mulhi_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_mulhi_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int16x4_t a3210 = vget_low_s16(a_.neon_i16); - int16x4_t b3210 = vget_low_s16(b_.neon_i16); - int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */ - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - int32x4_t ab7654 = vmull_high_s16(a_.neon_i16, b_.neon_i16); - r_.neon_i16 = vuzp2q_s16(vreinterpretq_s16_s32(ab3210), vreinterpretq_s16_s32(ab7654)); - #else - int16x4_t a7654 = vget_high_s16(a_.neon_i16); - int16x4_t b7654 = vget_high_s16(b_.neon_i16); - int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */ - uint16x8x2_t rv = vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654)); - r_.neon_u16 = rv.val[1]; - #endif - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - const v128_t lo = wasm_i32x4_extmul_low_i16x8(a_.wasm_v128, b_.wasm_v128); - const v128_t hi = wasm_i32x4_extmul_high_i16x8(a_.wasm_v128, b_.wasm_v128); - 
r_.wasm_v128 = wasm_i16x8_shuffle(lo, hi, 1, 3, 5, 7, 9, 11, 13, 15); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (HEDLEY_STATIC_CAST(uint32_t, HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i])) >> 16)); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_mulhi_epi16(a, b) simde_mm_mulhi_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mulhi_epu16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI) - return _mm_mulhi_epu16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint16x4_t a3210 = vget_low_u16(a_.neon_u16); - uint16x4_t b3210 = vget_low_u16(b_.neon_u16); - uint32x4_t ab3210 = vmull_u16(a3210, b3210); /* 3333222211110000 */ - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - uint32x4_t ab7654 = vmull_high_u16(a_.neon_u16, b_.neon_u16); - r_.neon_u16 = vuzp2q_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654)); - #else - uint16x4_t a7654 = vget_high_u16(a_.neon_u16); - uint16x4_t b7654 = vget_high_u16(b_.neon_u16); - uint32x4_t ab7654 = vmull_u16(a7654, b7654); /* 7777666655554444 */ - uint16x8x2_t neon_r = vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654)); - r_.neon_u16 = neon_r.val[1]; - #endif - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - const v128_t lo = wasm_u32x4_extmul_low_u16x8(a_.wasm_v128, b_.wasm_v128); - const v128_t hi = wasm_u32x4_extmul_high_u16x8(a_.wasm_v128, b_.wasm_v128); - r_.wasm_v128 = wasm_i16x8_shuffle(lo, hi, 1, 3, 5, 7, 9, 11, 13, 15); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.u16[i]) >> 16); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_mulhi_epu16(a, b) simde_mm_mulhi_epu16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mullo_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_mullo_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vmulq_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - (void) a_; - (void) b_; - r_.altivec_i16 = vec_mul(a_.altivec_i16, b_.altivec_i16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_mul(a_.wasm_v128, b_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.u16[i])); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_mullo_epi16(a, b) simde_mm_mullo_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_or_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_or_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - 
r_.i32f = a_.i32f | b_.i32f; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_or(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vorrq_s64(a_.neon_i64, b_.neon_i64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = a_.i32f[i] | b_.i32f[i]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_or_pd(a, b) simde_mm_or_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_or_si128 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_or_si128(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vorrq_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i32 = vec_or(a_.altivec_i32, b_.altivec_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_or(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f | b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = a_.i32f[i] | b_.i32f[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_or_si128(a, b) simde_mm_or_si128(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_packs_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_packs_epi16(a, b); - #else - simde__m128i_private - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - r_; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i8 = vqmovn_high_s16(vqmovn_s16(a_.neon_i16), b_.neon_i16); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vcombine_s8(vqmovn_s16(a_.neon_i16), vqmovn_s16(b_.neon_i16)); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i8 = vec_packs(a_.altivec_i16, b_.altivec_i16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_narrow_i16x8(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_CONVERT_VECTOR_) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) - int16_t SIMDE_VECTOR(32) v = SIMDE_SHUFFLE_VECTOR_(16, 32, a_.i16, b_.i16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - const int16_t SIMDE_VECTOR(32) min = { INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN }; - const int16_t SIMDE_VECTOR(32) max = { INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX }; - - int16_t m SIMDE_VECTOR(32); - m = HEDLEY_REINTERPRET_CAST(__typeof__(m), v < min); - v = (v & ~m) | (min & m); - - m = v > max; - v = (v & ~m) | (max & m); - - SIMDE_CONVERT_VECTOR_(r_.i8, v); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - int16_t v = (i < (sizeof(a_.i16) / sizeof(a_.i16[0]))) ? a_.i16[i] : b_.i16[i & 7]; - r_.i8[i] = (v < INT8_MIN) ? INT8_MIN : ((v > INT8_MAX) ? 
INT8_MAX : HEDLEY_STATIC_CAST(int8_t, v)); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_packs_epi16(a, b) simde_mm_packs_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_packs_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_packs_epi32(a, b); - #else - simde__m128i_private - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - r_; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i16 = vqmovn_high_s32(vqmovn_s32(a_.neon_i32), b_.neon_i32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vcombine_s16(vqmovn_s32(a_.neon_i32), vqmovn_s32(b_.neon_i32)); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i16 = vec_packs(a_.altivec_i32, b_.altivec_i32); - #elif defined(SIMDE_X86_SSE2_NATIVE) - r_.sse_m128i = _mm_packs_epi32(a_.sse_m128i, b_.sse_m128i); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_narrow_i32x4(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_CONVERT_VECTOR_) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) - int32_t SIMDE_VECTOR(32) v = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.i32, b_.i32, 0, 1, 2, 3, 4, 5, 6, 7); - const int32_t SIMDE_VECTOR(32) min = { INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN }; - const int32_t SIMDE_VECTOR(32) max = { INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX }; - - int32_t m SIMDE_VECTOR(32); - m = HEDLEY_REINTERPRET_CAST(__typeof__(m), v < min); - v = (v & ~m) | (min & m); - - m = HEDLEY_REINTERPRET_CAST(__typeof__(m), v > max); - v = (v & ~m) | (max & m); - - SIMDE_CONVERT_VECTOR_(r_.i16, v); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - int32_t v = (i < (sizeof(a_.i32) / sizeof(a_.i32[0]))) ? a_.i32[i] : b_.i32[i & 3]; - r_.i16[i] = (v < INT16_MIN) ? INT16_MIN : ((v > INT16_MAX) ? 
INT16_MAX : HEDLEY_STATIC_CAST(int16_t, v)); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_packs_epi32(a, b) simde_mm_packs_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_packus_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_packus_epi16(a, b); - #else - simde__m128i_private - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - r_; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - #if defined(SIMDE_BUG_CLANG_46840) - r_.neon_u8 = vqmovun_high_s16(vreinterpret_s8_u8(vqmovun_s16(a_.neon_i16)), b_.neon_i16); - #else - r_.neon_u8 = vqmovun_high_s16(vqmovun_s16(a_.neon_i16), b_.neon_i16); - #endif - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = - vcombine_u8( - vqmovun_s16(a_.neon_i16), - vqmovun_s16(b_.neon_i16) - ); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_u8 = vec_packsu(a_.altivec_i16, b_.altivec_i16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u8x16_narrow_i16x8(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_CONVERT_VECTOR_) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - int16_t v SIMDE_VECTOR(32) = SIMDE_SHUFFLE_VECTOR_(16, 32, a_.i16, b_.i16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - - v &= ~(v >> 15); - v |= HEDLEY_REINTERPRET_CAST(__typeof__(v), v > UINT8_MAX); - - SIMDE_CONVERT_VECTOR_(r_.i8, v); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - int16_t v = (i < (sizeof(a_.i16) / sizeof(a_.i16[0]))) ? a_.i16[i] : b_.i16[i & 7]; - r_.u8[i] = (v < 0) ? UINT8_C(0) : ((v > UINT8_MAX) ? UINT8_MAX : HEDLEY_STATIC_CAST(uint8_t, v)); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_packus_epi16(a, b) simde_mm_packus_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_pause (void) { - #if defined(SIMDE_X86_SSE2_NATIVE) - _mm_pause(); - #elif defined(SIMDE_ARCH_X86) - __asm__ __volatile__("pause"); - #elif defined(SIMDE_ARCH_ARM_NEON) - #if defined(_MSC_VER) - __isb(_ARM64_BARRIER_SY); - #else - __asm__ __volatile__("isb\n"); - #endif - #elif defined(SIMDE_ARCH_POWER) - __asm__ __volatile__ ("or 27,27,27" ::: "memory"); - #elif defined(SIMDE_ARCH_WASM) - __asm__ __volatile__ ("nop"); - #elif defined(HEDLEY_GCC_VERSION) - #if defined(SIMDE_ARCH_RISCV) - __builtin_riscv_pause(); - #else - __asm__ __volatile__ ("nop" ::: "memory"); - #endif - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_pause() (simde_mm_pause()) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_sad_epu8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_sad_epu8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - const uint16x8_t t = vpaddlq_u8(vabdq_u8(a_.neon_u8, b_.neon_u8)); - r_.neon_u64 = vcombine_u64( - vpaddl_u32(vpaddl_u16(vget_low_u16(t))), - vpaddl_u32(vpaddl_u16(vget_high_u16(t)))); - #else - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - uint16_t tmp = 0; - SIMDE_VECTORIZE_REDUCTION(+:tmp) - for (size_t j = 0 ; j < ((sizeof(r_.u8) / sizeof(r_.u8[0])) / 2) ; j++) { - const size_t e = j + (i * 8); - tmp += (a_.u8[e] > b_.u8[e]) ? 
(a_.u8[e] - b_.u8[e]) : (b_.u8[e] - a_.u8[e]); - } - r_.i64[i] = tmp; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_sad_epu8(a, b) simde_mm_sad_epu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_set_epi8 (int8_t e15, int8_t e14, int8_t e13, int8_t e12, - int8_t e11, int8_t e10, int8_t e9, int8_t e8, - int8_t e7, int8_t e6, int8_t e5, int8_t e4, - int8_t e3, int8_t e2, int8_t e1, int8_t e0) { - - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_set_epi8( - e15, e14, e13, e12, e11, e10, e9, e8, - e7, e6, e5, e4, e3, e2, e1, e0); - #else - simde__m128i_private r_; - - #if defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_make( - e0, e1, e2, e3, e4, e5, e6, e7, - e8, e9, e10, e11, e12, e13, e14, e15); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - SIMDE_ALIGN_LIKE_16(int8x16_t) int8_t data[16] = { - e0, e1, e2, e3, - e4, e5, e6, e7, - e8, e9, e10, e11, - e12, e13, e14, e15}; - r_.neon_i8 = vld1q_s8(data); - #else - r_.i8[ 0] = e0; - r_.i8[ 1] = e1; - r_.i8[ 2] = e2; - r_.i8[ 3] = e3; - r_.i8[ 4] = e4; - r_.i8[ 5] = e5; - r_.i8[ 6] = e6; - r_.i8[ 7] = e7; - r_.i8[ 8] = e8; - r_.i8[ 9] = e9; - r_.i8[10] = e10; - r_.i8[11] = e11; - r_.i8[12] = e12; - r_.i8[13] = e13; - r_.i8[14] = e14; - r_.i8[15] = e15; - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_set_epi16 (int16_t e7, int16_t e6, int16_t e5, int16_t e4, - int16_t e3, int16_t e2, int16_t e1, int16_t e0) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - SIMDE_ALIGN_LIKE_16(int16x8_t) int16_t data[8] = { e0, e1, e2, e3, e4, e5, e6, e7 }; - r_.neon_i16 = vld1q_s16(data); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_make(e0, e1, e2, e3, e4, e5, e6, e7); - #else - r_.i16[0] = e0; - r_.i16[1] = e1; - r_.i16[2] = e2; - r_.i16[3] = e3; - r_.i16[4] = e4; - r_.i16[5] = e5; - r_.i16[6] = e6; - r_.i16[7] = e7; - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_loadu_si16 (void const* mem_addr) { - #if defined(SIMDE_X86_SSE2_NATIVE) && ( \ - SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \ - HEDLEY_INTEL_VERSION_CHECK(20,21,1) || \ - HEDLEY_GCC_VERSION_CHECK(12,1,0)) - return _mm_loadu_si16(mem_addr); - #else - int16_t val; - simde_memcpy(&val, mem_addr, sizeof(val)); - return simde_x_mm_cvtsi16_si128(val); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_loadu_si16(mem_addr) simde_mm_loadu_si16(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_set_epi32 (int32_t e3, int32_t e2, int32_t e1, int32_t e0) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_set_epi32(e3, e2, e1, e0); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - SIMDE_ALIGN_LIKE_16(int32x4_t) int32_t data[4] = { e0, e1, e2, e3 }; - r_.neon_i32 = vld1q_s32(data); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - 
r_.wasm_v128 = wasm_i32x4_make(e0, e1, e2, e3); - #else - r_.i32[0] = e0; - r_.i32[1] = e1; - r_.i32[2] = e2; - r_.i32[3] = e3; - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_set_epi32(e3, e2, e1, e0) simde_mm_set_epi32(e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_loadu_si32 (void const* mem_addr) { - #if defined(SIMDE_X86_SSE2_NATIVE) && ( \ - SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \ - HEDLEY_INTEL_VERSION_CHECK(20,21,1) || \ - HEDLEY_GCC_VERSION_CHECK(12,1,0)) - return _mm_loadu_si32(mem_addr); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return simde__m128i_from_wasm_v128(wasm_v128_load32_zero(mem_addr)); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - simde__m128i_private r_; - r_.neon_i32 = vsetq_lane_s32(* HEDLEY_REINTERPRET_CAST(const int32_t *, mem_addr), vdupq_n_s32(0), 0); - return simde__m128i_from_private(r_); - #else - int32_t val; - simde_memcpy(&val, mem_addr, sizeof(val)); - return simde_mm_cvtsi32_si128(val); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_loadu_si32(mem_addr) simde_mm_loadu_si32(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_set_epi64 (simde__m64 e1, simde__m64 e0) { - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_set_epi64(e1, e0); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vcombine_s64(simde__m64_to_neon_i64(e0), simde__m64_to_neon_i64(e1)); - #else - r_.m64[0] = e0; - r_.m64[1] = e1; - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_set_epi64(e1, e0) (simde_mm_set_epi64((e1), (e0))) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_set_epi64x (int64_t e1, int64_t e0) { - #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,0,0)) - return _mm_set_epi64x(e1, e0); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - SIMDE_ALIGN_LIKE_16(int64x2_t) int64_t data[2] = {e0, e1}; - r_.neon_i64 = vld1q_s64(data); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x2_make(e0, e1); - #else - r_.i64[0] = e0; - r_.i64[1] = e1; - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_set_epi64x(e1, e0) simde_mm_set_epi64x(e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_loadu_si64 (void const* mem_addr) { - #if defined(SIMDE_X86_SSE2_NATIVE) && ( \ - SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \ - HEDLEY_GCC_VERSION_CHECK(11,0,0) || \ - HEDLEY_INTEL_VERSION_CHECK(20,21,1)) - return _mm_loadu_si64(mem_addr); - #else - int64_t val; - simde_memcpy(&val, mem_addr, sizeof(val)); - return simde_mm_cvtsi64_si128(val); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_loadu_si64(mem_addr) simde_mm_loadu_si64(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_set_epu8 (uint8_t e15, uint8_t e14, uint8_t e13, uint8_t e12, - uint8_t e11, uint8_t e10, uint8_t e9, uint8_t e8, - uint8_t e7, uint8_t e6, uint8_t e5, uint8_t e4, - uint8_t e3, uint8_t e2, uint8_t e1, uint8_t e0) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_set_epi8( - HEDLEY_STATIC_CAST(char, e15), HEDLEY_STATIC_CAST(char, e14), HEDLEY_STATIC_CAST(char, e13), HEDLEY_STATIC_CAST(char, e12), - HEDLEY_STATIC_CAST(char, e11), 
HEDLEY_STATIC_CAST(char, e10), HEDLEY_STATIC_CAST(char, e9), HEDLEY_STATIC_CAST(char, e8), - HEDLEY_STATIC_CAST(char, e7), HEDLEY_STATIC_CAST(char, e6), HEDLEY_STATIC_CAST(char, e5), HEDLEY_STATIC_CAST(char, e4), - HEDLEY_STATIC_CAST(char, e3), HEDLEY_STATIC_CAST(char, e2), HEDLEY_STATIC_CAST(char, e1), HEDLEY_STATIC_CAST(char, e0)); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - SIMDE_ALIGN_LIKE_16(uint8x16_t) uint8_t data[16] = { - e0, e1, e2, e3, - e4, e5, e6, e7, - e8, e9, e10, e11, - e12, e13, e14, e15}; - r_.neon_u8 = vld1q_u8(data); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u8x16_make(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15); - #else - r_.u8[ 0] = e0; r_.u8[ 1] = e1; r_.u8[ 2] = e2; r_.u8[ 3] = e3; - r_.u8[ 4] = e4; r_.u8[ 5] = e5; r_.u8[ 6] = e6; r_.u8[ 7] = e7; - r_.u8[ 8] = e8; r_.u8[ 9] = e9; r_.u8[10] = e10; r_.u8[11] = e11; - r_.u8[12] = e12; r_.u8[13] = e13; r_.u8[14] = e14; r_.u8[15] = e15; - #endif - - return simde__m128i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_set_epu16 (uint16_t e7, uint16_t e6, uint16_t e5, uint16_t e4, - uint16_t e3, uint16_t e2, uint16_t e1, uint16_t e0) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_set_epi16( - HEDLEY_STATIC_CAST(short, e7), HEDLEY_STATIC_CAST(short, e6), HEDLEY_STATIC_CAST(short, e5), HEDLEY_STATIC_CAST(short, e4), - HEDLEY_STATIC_CAST(short, e3), HEDLEY_STATIC_CAST(short, e2), HEDLEY_STATIC_CAST(short, e1), HEDLEY_STATIC_CAST(short, e0)); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - SIMDE_ALIGN_LIKE_16(uint16x8_t) uint16_t data[8] = { e0, e1, e2, e3, e4, e5, e6, e7 }; - r_.neon_u16 = vld1q_u16(data); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u16x8_make(e0, e1, e2, e3, e4, e5, e6, e7); - #else - r_.u16[0] = e0; r_.u16[1] = e1; r_.u16[2] = e2; r_.u16[3] = e3; - r_.u16[4] = e4; r_.u16[5] = e5; r_.u16[6] = e6; r_.u16[7] = e7; - #endif - - return simde__m128i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_set_epu32 (uint32_t e3, uint32_t e2, uint32_t e1, uint32_t e0) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_set_epi32( - HEDLEY_STATIC_CAST(int, e3), HEDLEY_STATIC_CAST(int, e2), HEDLEY_STATIC_CAST(int, e1), HEDLEY_STATIC_CAST(int, e0)); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - SIMDE_ALIGN_LIKE_16(uint32x4_t) uint32_t data[4] = { e0, e1, e2, e3 }; - r_.neon_u32 = vld1q_u32(data); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u32x4_make(e0, e1, e2, e3); - #else - r_.u32[0] = e0; - r_.u32[1] = e1; - r_.u32[2] = e2; - r_.u32[3] = e3; - #endif - - return simde__m128i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_set_epu64x (uint64_t e1, uint64_t e0) { - #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,0,0)) - return _mm_set_epi64x(HEDLEY_STATIC_CAST(int64_t, e1), HEDLEY_STATIC_CAST(int64_t, e0)); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - SIMDE_ALIGN_LIKE_16(uint64x2_t) uint64_t data[2] = {e0, e1}; - r_.neon_u64 = vld1q_u64(data); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u64x2_make(e0, e1); - #else - r_.u64[0] = e0; - r_.u64[1] = e1; - #endif - - return simde__m128i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_set_sd (simde_float64 a) { - #if 
defined(SIMDE_X86_SSE2_NATIVE) - return _mm_set_sd(a); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vsetq_lane_f64(a, vdupq_n_f64(SIMDE_FLOAT64_C(0.0)), 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return simde__m128d_from_wasm_v128(wasm_f64x2_make(a, 0)); - #else - return simde_mm_set_pd(SIMDE_FLOAT64_C(0.0), a); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_set_sd(a) simde_mm_set_sd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_set1_epi8 (int8_t a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_set1_epi8(a); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vdupq_n_s8(a); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_splat(a); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i8 = vec_splats(HEDLEY_STATIC_CAST(signed char, a)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_set1_epi8(a) simde_mm_set1_epi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_set1_epi16 (int16_t a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_set1_epi16(a); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vdupq_n_s16(a); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_splat(a); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i16 = vec_splats(HEDLEY_STATIC_CAST(signed short, a)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_set1_epi16(a) simde_mm_set1_epi16(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_set1_epi32 (int32_t a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_set1_epi32(a); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vdupq_n_s32(a); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_splat(a); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i32 = vec_splats(HEDLEY_STATIC_CAST(signed int, a)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_set1_epi32(a) simde_mm_set1_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_set1_epi64x (int64_t a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,0,0)) - return _mm_set1_epi64x(a); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vdupq_n_s64(a); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x2_splat(a); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i64 = vec_splats(HEDLEY_STATIC_CAST(signed long long, a)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if 
defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_set1_epi64x(a) simde_mm_set1_epi64x(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_set1_epi64 (simde__m64 a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_set1_epi64(a); - #else - simde__m64_private a_ = simde__m64_to_private(a); - return simde_mm_set1_epi64x(a_.i64[0]); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_set1_epi64(a) simde_mm_set1_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_set1_epu8 (uint8_t value) { - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - return simde__m128i_from_altivec_u8(vec_splats(HEDLEY_STATIC_CAST(unsigned char, value))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return simde__m128i_from_wasm_v128(wasm_u8x16_splat(value)); - #else - return simde_mm_set1_epi8(HEDLEY_STATIC_CAST(int8_t, value)); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_set1_epu16 (uint16_t value) { - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - return simde__m128i_from_altivec_u16(vec_splats(HEDLEY_STATIC_CAST(unsigned short, value))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return simde__m128i_from_wasm_v128(wasm_u16x8_splat(value)); - #else - return simde_mm_set1_epi16(HEDLEY_STATIC_CAST(int16_t, value)); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_set1_epu32 (uint32_t value) { - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - return simde__m128i_from_altivec_u32(vec_splats(HEDLEY_STATIC_CAST(unsigned int, value))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return simde__m128i_from_wasm_v128(wasm_u32x4_splat(value)); - #else - return simde_mm_set1_epi32(HEDLEY_STATIC_CAST(int32_t, value)); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_set1_epu64 (uint64_t value) { - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - return simde__m128i_from_altivec_u64(vec_splats(HEDLEY_STATIC_CAST(unsigned long long, value))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return simde__m128i_from_wasm_v128(wasm_u64x2_splat(value)); - #else - return simde_mm_set1_epi64x(HEDLEY_STATIC_CAST(int64_t, value)); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_setr_epi8 (int8_t e15, int8_t e14, int8_t e13, int8_t e12, - int8_t e11, int8_t e10, int8_t e9, int8_t e8, - int8_t e7, int8_t e6, int8_t e5, int8_t e4, - int8_t e3, int8_t e2, int8_t e1, int8_t e0) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_setr_epi8( - e15, e14, e13, e12, e11, e10, e9, e8, - e7, e6, e5, e4, e3, e2, e1, e0); - #else - return simde_mm_set_epi8( - e0, e1, e2, e3, e4, e5, e6, e7, - e8, e9, e10, e11, e12, e13, e14, e15); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_setr_epi16 (int16_t e7, int16_t e6, int16_t e5, int16_t e4, - int16_t e3, int16_t e2, int16_t e1, int16_t e0) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0); - #else - return simde_mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_setr_epi32 (int32_t e3, int32_t e2, int32_t e1, int32_t e0) { - 
#if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_setr_epi32(e3, e2, e1, e0); - #else - return simde_mm_set_epi32(e0, e1, e2, e3); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_setr_epi32(e3, e2, e1, e0) simde_mm_setr_epi32(e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_setr_epi64 (simde__m64 e1, simde__m64 e0) { - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_setr_epi64(e1, e0); - #else - return simde_mm_set_epi64(e0, e1); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_setr_epi64(e1, e0) (simde_mm_setr_epi64((e1), (e0))) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_setr_pd (simde_float64 e1, simde_float64 e0) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_setr_pd(e1, e0); - #else - return simde_mm_set_pd(e0, e1); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_setr_pd(e1, e0) simde_mm_setr_pd(e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_setzero_pd (void) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_setzero_pd(); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return simde__m128d_from_wasm_v128(wasm_f64x2_const(0.0, 0.0)); - #else - return simde_mm_castsi128_pd(simde_mm_setzero_si128()); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_setzero_pd() simde_mm_setzero_pd() -#endif - -#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_undefined_pd (void) { - simde__m128d_private r_; - - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128) - r_.n = _mm_undefined_pd(); - #elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) - r_ = simde__m128d_to_private(simde_mm_setzero_pd()); - #endif - - return simde__m128d_from_private(r_); -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_undefined_pd() simde_mm_undefined_pd() -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_undefined_si128 (void) { - simde__m128i_private r_; - - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128) - r_.n = _mm_undefined_si128(); - #elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) - r_ = simde__m128i_to_private(simde_mm_setzero_si128()); - #endif - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_undefined_si128() (simde_mm_undefined_si128()) -#endif - -#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) -HEDLEY_DIAGNOSTIC_POP -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_x_mm_setone_pd (void) { - return simde_mm_castps_pd(simde_x_mm_setone_ps()); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_setone_si128 (void) { - return simde_mm_castps_si128(simde_x_mm_setone_ps()); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_shuffle_epi32 (simde__m128i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[(imm8 >> (i * 2)) & 3]; - } - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSE2_NATIVE) - #define simde_mm_shuffle_epi32(a, imm8) _mm_shuffle_epi32((a), (imm8)) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - #define simde_mm_shuffle_epi32(a, imm8) (__extension__ ({ \ - const simde__m128i_private simde_tmp_a_ = 
simde__m128i_to_private(a); \ - simde__m128i_from_wasm_v128( \ - wasm_i32x4_shuffle( \ - (simde_tmp_a_).wasm_v128, \ - (simde_tmp_a_).wasm_v128, \ - ((imm8) ) & 3, \ - ((imm8) >> 2) & 3, \ - ((imm8) >> 4) & 3, \ - ((imm8) >> 6) & 3)); })) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm_shuffle_epi32(a, imm8) \ - (__extension__ ({ \ - const int32x4_t simde_mm_shuffle_epi32_a_ = simde__m128i_to_neon_i32(a); \ - int32x4_t simde_mm_shuffle_epi32_r_; \ - simde_mm_shuffle_epi32_r_ = vmovq_n_s32(vgetq_lane_s32(simde_mm_shuffle_epi32_a_, (imm8) & (0x3))); \ - simde_mm_shuffle_epi32_r_ = vsetq_lane_s32(vgetq_lane_s32(simde_mm_shuffle_epi32_a_, ((imm8) >> 2) & 0x3), simde_mm_shuffle_epi32_r_, 1); \ - simde_mm_shuffle_epi32_r_ = vsetq_lane_s32(vgetq_lane_s32(simde_mm_shuffle_epi32_a_, ((imm8) >> 4) & 0x3), simde_mm_shuffle_epi32_r_, 2); \ - simde_mm_shuffle_epi32_r_ = vsetq_lane_s32(vgetq_lane_s32(simde_mm_shuffle_epi32_a_, ((imm8) >> 6) & 0x3), simde_mm_shuffle_epi32_r_, 3); \ - vreinterpretq_s64_s32(simde_mm_shuffle_epi32_r_); \ - })) -#elif defined(SIMDE_SHUFFLE_VECTOR_) - #define simde_mm_shuffle_epi32(a, imm8) (__extension__ ({ \ - const simde__m128i_private simde_tmp_a_ = simde__m128i_to_private(a); \ - simde__m128i_from_private((simde__m128i_private) { .i32 = \ - SIMDE_SHUFFLE_VECTOR_(32, 16, \ - (simde_tmp_a_).i32, \ - (simde_tmp_a_).i32, \ - ((imm8) ) & 3, \ - ((imm8) >> 2) & 3, \ - ((imm8) >> 4) & 3, \ - ((imm8) >> 6) & 3) }); })) -#endif -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_shuffle_epi32(a, imm8) simde_mm_shuffle_epi32(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_shuffle_pd (simde__m128d a, simde__m128d b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) { - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - r_.f64[0] = ((imm8 & 1) == 0) ? a_.f64[0] : a_.f64[1]; - r_.f64[1] = ((imm8 & 2) == 0) ? 
b_.f64[0] : b_.f64[1]; - - return simde__m128d_from_private(r_); -} -#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI) - #define simde_mm_shuffle_pd(a, b, imm8) _mm_shuffle_pd((a), (b), (imm8)) -#elif defined(SIMDE_SHUFFLE_VECTOR_) - #define simde_mm_shuffle_pd(a, b, imm8) (__extension__ ({ \ - simde__m128d_from_private((simde__m128d_private) { .f64 = \ - SIMDE_SHUFFLE_VECTOR_(64, 16, \ - simde__m128d_to_private(a).f64, \ - simde__m128d_to_private(b).f64, \ - (((imm8) ) & 1), \ - (((imm8) >> 1) & 1) + 2) }); })) -#endif -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_shuffle_pd(a, b, imm8) simde_mm_shuffle_pd(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_shufflehi_epi16 (simde__m128i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2) ; i++) { - r_.i16[i] = a_.i16[i]; - } - for (size_t i = ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2) ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[((imm8 >> ((i - 4) * 2)) & 3) + 4]; - } - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSE2_NATIVE) - #define simde_mm_shufflehi_epi16(a, imm8) _mm_shufflehi_epi16((a), (imm8)) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm_shufflehi_epi16(a, imm8) \ - (__extension__ ({ \ - int16x8_t simde_mm_shufflehi_epi16_a_ = simde__m128i_to_neon_i16(a); \ - int16x8_t simde_mm_shufflehi_epi16_r_ = simde_mm_shufflehi_epi16_a_; \ - simde_mm_shufflehi_epi16_r_ = vsetq_lane_s16(vgetq_lane_s16(simde_mm_shufflehi_epi16_a_, (((imm8) ) & 0x3) + 4), simde_mm_shufflehi_epi16_r_, 4); \ - simde_mm_shufflehi_epi16_r_ = vsetq_lane_s16(vgetq_lane_s16(simde_mm_shufflehi_epi16_a_, (((imm8) >> 2) & 0x3) + 4), simde_mm_shufflehi_epi16_r_, 5); \ - simde_mm_shufflehi_epi16_r_ = vsetq_lane_s16(vgetq_lane_s16(simde_mm_shufflehi_epi16_a_, (((imm8) >> 4) & 0x3) + 4), simde_mm_shufflehi_epi16_r_, 6); \ - simde_mm_shufflehi_epi16_r_ = vsetq_lane_s16(vgetq_lane_s16(simde_mm_shufflehi_epi16_a_, (((imm8) >> 6) & 0x3) + 4), simde_mm_shufflehi_epi16_r_, 7); \ - simde__m128i_from_neon_i16(simde_mm_shufflehi_epi16_r_); \ - })) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - #define simde_mm_shufflehi_epi16(a, imm8) (__extension__ ({ \ - const simde__m128i_private simde_tmp_a_ = simde__m128i_to_private(a); \ - simde__m128i_from_private((simde__m128i_private) { .wasm_v128 = \ - wasm_i16x8_shuffle( \ - (simde_tmp_a_).wasm_v128, \ - (simde_tmp_a_).wasm_v128, \ - 0, 1, 2, 3, \ - (((imm8) ) & 3) + 4, \ - (((imm8) >> 2) & 3) + 4, \ - (((imm8) >> 4) & 3) + 4, \ - (((imm8) >> 6) & 3) + 4) }); })) -#elif defined(SIMDE_SHUFFLE_VECTOR_) - #define simde_mm_shufflehi_epi16(a, imm8) (__extension__ ({ \ - const simde__m128i_private simde_tmp_a_ = simde__m128i_to_private(a); \ - simde__m128i_from_private((simde__m128i_private) { .i16 = \ - SIMDE_SHUFFLE_VECTOR_(16, 16, \ - (simde_tmp_a_).i16, \ - (simde_tmp_a_).i16, \ - 0, 1, 2, 3, \ - (((imm8) ) & 3) + 4, \ - (((imm8) >> 2) & 3) + 4, \ - (((imm8) >> 4) & 3) + 4, \ - (((imm8) >> 6) & 3) + 4) }); })) -#endif -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_shufflehi_epi16(a, imm8) simde_mm_shufflehi_epi16(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_shufflelo_epi16 (simde__m128i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m128i_private - r_, - a_ = 
simde__m128i_to_private(a); - - for (size_t i = 0 ; i < ((sizeof(r_.i16) / sizeof(r_.i16[0])) / 2) ; i++) { - r_.i16[i] = a_.i16[((imm8 >> (i * 2)) & 3)]; - } - SIMDE_VECTORIZE - for (size_t i = ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2) ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[i]; - } - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSE2_NATIVE) - #define simde_mm_shufflelo_epi16(a, imm8) _mm_shufflelo_epi16((a), (imm8)) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - #define simde_mm_shufflelo_epi16(a, imm8) \ - simde__m128i_from_wasm_v128( \ - wasm_i16x8_shuffle( \ - simde__m128i_to_wasm_v128((a)), \ - wasm_i16x8_splat(0), \ - (((imm8) & 0x03) ), \ - (((imm8) & 0x0c) >> 2), \ - (((imm8) & 0x30) >> 4), \ - (((imm8) & 0xc0) >> 6), \ - 4, 5, 6, 7)) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm_shufflelo_epi16(a, imm8) \ - (__extension__({ \ - int16x8_t simde_mm_shufflelo_epi16_a_ = simde__m128i_to_neon_i16(a); \ - int16x8_t simde_mm_shufflelo_epi16_r_ = simde_mm_shufflelo_epi16_a_; \ - simde_mm_shufflelo_epi16_r_ = vsetq_lane_s16(vgetq_lane_s16(simde_mm_shufflelo_epi16_a_, (((imm8) ) & 0x3)), simde_mm_shufflelo_epi16_r_, 0); \ - simde_mm_shufflelo_epi16_r_ = vsetq_lane_s16(vgetq_lane_s16(simde_mm_shufflelo_epi16_a_, (((imm8) >> 2) & 0x3)), simde_mm_shufflelo_epi16_r_, 1); \ - simde_mm_shufflelo_epi16_r_ = vsetq_lane_s16(vgetq_lane_s16(simde_mm_shufflelo_epi16_a_, (((imm8) >> 4) & 0x3)), simde_mm_shufflelo_epi16_r_, 2); \ - simde_mm_shufflelo_epi16_r_ = vsetq_lane_s16(vgetq_lane_s16(simde_mm_shufflelo_epi16_a_, (((imm8) >> 6) & 0x3)), simde_mm_shufflelo_epi16_r_, 3); \ - simde__m128i_from_neon_i16(simde_mm_shufflelo_epi16_r_); \ - })) -#elif defined(SIMDE_SHUFFLE_VECTOR_) - #define simde_mm_shufflelo_epi16(a, imm8) (__extension__ ({ \ - const simde__m128i_private simde_tmp_a_ = simde__m128i_to_private(a); \ - simde__m128i_from_private((simde__m128i_private) { .i16 = \ - SIMDE_SHUFFLE_VECTOR_(16, 16, \ - (simde_tmp_a_).i16, \ - (simde_tmp_a_).i16, \ - (((imm8) ) & 3), \ - (((imm8) >> 2) & 3), \ - (((imm8) >> 4) & 3), \ - (((imm8) >> 6) & 3), \ - 4, 5, 6, 7) }); })) -#endif -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_shufflelo_epi16(a, imm8) simde_mm_shufflelo_epi16(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_sll_epi16 (simde__m128i a, simde__m128i count) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_sll_epi16(a, count); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - count_ = simde__m128i_to_private(count); - - if (count_.u64[0] > 15) - return simde_mm_setzero_si128(); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u16 = (a_.u16 << count_.u64[0]); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vshlq_u16(a_.neon_u16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, count_.u64[0]))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = ((wasm_i64x2_extract_lane(count_.wasm_v128, 0) < 16) ? 
wasm_i16x8_shl(a_.wasm_v128, HEDLEY_STATIC_CAST(int32_t, wasm_i64x2_extract_lane(count_.wasm_v128, 0))) : wasm_i16x8_const(0,0,0,0,0,0,0,0)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (a_.u16[i] << count_.u64[0])); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_sll_epi16(a, count) simde_mm_sll_epi16((a), (count)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_sll_epi32 (simde__m128i a, simde__m128i count) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_sll_epi32(a, count); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - count_ = simde__m128i_to_private(count); - - if (count_.u64[0] > 31) - return simde_mm_setzero_si128(); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u32 = (a_.u32 << count_.u64[0]); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vshlq_u32(a_.neon_u32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, count_.u64[0]))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = ((wasm_i64x2_extract_lane(count_.wasm_v128, 0) < 32) ? wasm_i32x4_shl(a_.wasm_v128, HEDLEY_STATIC_CAST(int32_t, wasm_i64x2_extract_lane(count_.wasm_v128, 0))) : wasm_i32x4_const(0,0,0,0)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (a_.u32[i] << count_.u64[0])); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_sll_epi32(a, count) (simde_mm_sll_epi32(a, (count))) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_sll_epi64 (simde__m128i a, simde__m128i count) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_sll_epi64(a, count); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - count_ = simde__m128i_to_private(count); - - if (count_.u64[0] > 63) - return simde_mm_setzero_si128(); - - const int_fast16_t s = HEDLEY_STATIC_CAST(int_fast16_t, count_.u64[0]); - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, s))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = (s < 64) ? 
wasm_i64x2_shl(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, s)) : wasm_i64x2_const(0,0); - #else - #if !defined(SIMDE_BUG_GCC_94488) - SIMDE_VECTORIZE - #endif - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = a_.u64[i] << s; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_sll_epi64(a, count) (simde_mm_sll_epi64(a, (count))) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_sqrt_pd (simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_sqrt_pd(a); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vsqrtq_f64(a_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_sqrt(a_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = vec_sqrt(a_.altivec_f64); - #elif defined(simde_math_sqrt) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_sqrt(a_.f64[i]); - } - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_sqrt_pd(a) simde_mm_sqrt_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_sqrt_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_sqrt_sd(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_sd(a, simde_mm_sqrt_pd(b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_sd(a, simde_mm_sqrt_pd(simde_x_mm_broadcastlow_pd(b))); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(simde_math_sqrt) - r_.f64[0] = simde_math_sqrt(b_.f64[0]); - r_.f64[1] = a_.f64[1]; - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_sqrt_sd(a, b) simde_mm_sqrt_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_srl_epi16 (simde__m128i a, simde__m128i count) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_srl_epi16(a, count); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - count_ = simde__m128i_to_private(count); - - const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 16 ? 16 : count_.i64[0])); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vshlq_u16(a_.neon_u16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = a_.u16[i] >> cnt; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_srl_epi16(a, count) (simde_mm_srl_epi16(a, (count))) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_srl_epi32 (simde__m128i a, simde__m128i count) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_srl_epi32(a, count); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - count_ = simde__m128i_to_private(count); - - const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 32 ? 
32 : count_.i64[0])); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vshlq_u32(a_.neon_u32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -cnt))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u32x4_shr(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, cnt)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] >> cnt; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_srl_epi32(a, count) (simde_mm_srl_epi32(a, (count))) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_srl_epi64 (simde__m128i a, simde__m128i count) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_srl_epi64(a, count); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - count_ = simde__m128i_to_private(count); - - const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 64 ? 64 : count_.i64[0])); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, -cnt))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u64x2_shr(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, cnt)); - #else - #if !defined(SIMDE_BUG_GCC_94488) - SIMDE_VECTORIZE - #endif - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = a_.u64[i] >> cnt; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_srl_epi64(a, count) (simde_mm_srl_epi64(a, (count))) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_srai_epi16 (simde__m128i a, const int imm8) - SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - /* MSVC requires a range of (0, 255). */ - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - const int cnt = (imm8 & ~15) ? 15 : imm8; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_shr(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, cnt)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[i] >> cnt; - } - #endif - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSE2_NATIVE) - #define simde_mm_srai_epi16(a, imm8) _mm_srai_epi16((a), (imm8)) -#endif -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_srai_epi16(a, imm8) simde_mm_srai_epi16(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_srai_epi32 (simde__m128i a, const int imm8) - SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - /* MSVC requires a range of (0, 255). */ - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - const int cnt = (imm8 & ~31) ? 
31 : imm8; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(-cnt)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_shr(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, cnt)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] >> cnt; - } - #endif - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSE2_NATIVE) - #define simde_mm_srai_epi32(a, imm8) _mm_srai_epi32((a), (imm8)) -#endif -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_srai_epi32(a, imm8) simde_mm_srai_epi32(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_sra_epi16 (simde__m128i a, simde__m128i count) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_sra_epi16(a, count); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - count_ = simde__m128i_to_private(count); - - const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 15 ? 15 : count_.i64[0])); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_shr(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, cnt)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[i] >> cnt; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_sra_epi16(a, count) (simde_mm_sra_epi16(a, count)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_sra_epi32 (simde__m128i a, simde__m128i count) { - #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_SRA_EPI32) - return _mm_sra_epi32(a, count); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - count_ = simde__m128i_to_private(count); - - const int cnt = count_.u64[0] > 31 ? 31 : HEDLEY_STATIC_CAST(int, count_.u64[0]); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -cnt))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_shr(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, cnt)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] >> cnt; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_sra_epi32(a, count) (simde_mm_sra_epi32(a, (count))) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_slli_epi16 (simde__m128i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - if (HEDLEY_UNLIKELY((imm8 > 15))) { - return simde_mm_setzero_si128(); - } - - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i16 = a_.i16 << SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8 & 0xff); - #else - const int s = (imm8 > HEDLEY_STATIC_CAST(int, sizeof(r_.i16[0]) * CHAR_BIT) - 1) ? 0 : imm8; - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i16[i] << s); - } - #endif - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSE2_NATIVE) - #define simde_mm_slli_epi16(a, imm8) _mm_slli_epi16(a, imm8) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #define simde_mm_slli_epi16(a, imm8) \ - (((imm8) <= 0) ? 
\ - (a) : \ - simde__m128i_from_neon_i16( \ - ((imm8) > 15) ? \ - vandq_s16(simde__m128i_to_neon_i16(a), vdupq_n_s16(0)) : \ - vshlq_n_s16(simde__m128i_to_neon_i16(a), ((imm8) & 15)))) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - #define simde_mm_slli_epi16(a, imm8) \ - ((imm8 < 16) ? wasm_i16x8_shl(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i16x8_const(0,0,0,0,0,0,0,0)) -#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - #define simde_mm_slli_epi16(a, imm8) \ - ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_altivec_i16(vec_sl(simde__m128i_to_altivec_i16(a), vec_splat_u16(HEDLEY_STATIC_CAST(unsigned short, imm8))))) -#endif -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_slli_epi16(a, imm8) simde_mm_slli_epi16(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_slli_epi32 (simde__m128i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - if (HEDLEY_UNLIKELY((imm8 > 31))) { - return simde_mm_setzero_si128(); - } - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i32 = a_.i32 << imm8; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] << (imm8 & 0xff); - } - #endif - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSE2_NATIVE) - #define simde_mm_slli_epi32(a, imm8) _mm_slli_epi32(a, imm8) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #define simde_mm_slli_epi32(a, imm8) \ - (((imm8) <= 0) ? \ - (a) : \ - simde__m128i_from_neon_i32( \ - ((imm8) > 31) ? \ - vandq_s32(simde__m128i_to_neon_i32(a), vdupq_n_s32(0)) : \ - vshlq_n_s32(simde__m128i_to_neon_i32(a), ((imm8) & 31)))) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - #define simde_mm_slli_epi32(a, imm8) \ - ((imm8 < 32) ? wasm_i32x4_shl(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i32x4_const(0,0,0,0)) -#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - #define simde_mm_slli_epi32(a, imm8) \ - (__extension__ ({ \ - simde__m128i ret; \ - if ((imm8) <= 0) { \ - ret = a; \ - } else if ((imm8) > 31) { \ - ret = simde_mm_setzero_si128(); \ - } else { \ - ret = simde__m128i_from_altivec_i32( \ - vec_sl(simde__m128i_to_altivec_i32(a), \ - vec_splats(HEDLEY_STATIC_CAST(unsigned int, (imm8) & 31)))); \ - } \ - ret; \ - })) -#endif -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_slli_epi32(a, imm8) simde_mm_slli_epi32(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_slli_epi64 (simde__m128i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - if (HEDLEY_UNLIKELY((imm8 > 63))) { - return simde_mm_setzero_si128(); - } - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i64 = a_.i64 << imm8; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[i] << (imm8 & 0xff); - } - #endif - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSE2_NATIVE) - #define simde_mm_slli_epi64(a, imm8) _mm_slli_epi64(a, imm8) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #define simde_mm_slli_epi64(a, imm8) \ - (((imm8) <= 0) ? \ - (a) : \ - simde__m128i_from_neon_i64( \ - ((imm8) > 63) ? \ - vandq_s64(simde__m128i_to_neon_i64(a), vdupq_n_s64(0)) : \ - vshlq_n_s64(simde__m128i_to_neon_i64(a), ((imm8) & 63)))) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - #define simde_mm_slli_epi64(a, imm8) \ - ((imm8 < 64) ? 
wasm_i64x2_shl(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i64x2_const(0,0)) -#endif -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_slli_epi64(a, imm8) simde_mm_slli_epi64(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_srli_epi16 (simde__m128i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - if (HEDLEY_UNLIKELY((imm8 > 15))) { - return simde_mm_setzero_si128(); - } - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u16 = a_.u16 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.u16[i] = a_.u16[i] >> (imm8 & 0xff); - } - #endif - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSE2_NATIVE) - #define simde_mm_srli_epi16(a, imm8) _mm_srli_epi16(a, imm8) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #define simde_mm_srli_epi16(a, imm8) \ - (((imm8) <= 0) ? \ - (a) : \ - simde__m128i_from_neon_u16( \ - ((imm8) > 15) ? \ - vandq_u16(simde__m128i_to_neon_u16(a), vdupq_n_u16(0)) : \ - vshrq_n_u16(simde__m128i_to_neon_u16(a), ((imm8) & 15) | (((imm8) & 15) == 0)))) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - #define simde_mm_srli_epi16(a, imm8) \ - ((imm8 < 16) ? wasm_u16x8_shr(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i16x8_const(0,0,0,0,0,0,0,0)) -#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - #define simde_mm_srli_epi16(a, imm8) \ - ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_altivec_i16(vec_sr(simde__m128i_to_altivec_i16(a), vec_splat_u16(HEDLEY_STATIC_CAST(unsigned short, imm8))))) -#endif -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_srli_epi16(a, imm8) simde_mm_srli_epi16(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_srli_epi32 (simde__m128i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - if (HEDLEY_UNLIKELY((imm8 > 31))) { - return simde_mm_setzero_si128(); - } - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u32 = a_.u32 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8 & 0xff); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.u32[i] = a_.u32[i] >> (imm8 & 0xff); - } - #endif - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSE2_NATIVE) - #define simde_mm_srli_epi32(a, imm8) _mm_srli_epi32(a, imm8) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #define simde_mm_srli_epi32(a, imm8) \ - (((imm8) <= 0) ? \ - (a) : \ - simde__m128i_from_neon_u32( \ - ((imm8) > 31) ? \ - vandq_u32(simde__m128i_to_neon_u32(a), vdupq_n_u32(0)) : \ - vshrq_n_u32(simde__m128i_to_neon_u32(a), ((imm8) & 31) | (((imm8) & 31) == 0)))) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - #define simde_mm_srli_epi32(a, imm8) \ - ((imm8 < 32) ? 
wasm_u32x4_shr(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i32x4_const(0,0,0,0)) -#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - #define simde_mm_srli_epi32(a, imm8) \ - (__extension__ ({ \ - simde__m128i ret; \ - if ((imm8) <= 0) { \ - ret = a; \ - } else if ((imm8) > 31) { \ - ret = simde_mm_setzero_si128(); \ - } else { \ - ret = simde__m128i_from_altivec_i32( \ - vec_sr(simde__m128i_to_altivec_i32(a), \ - vec_splats(HEDLEY_STATIC_CAST(unsigned int, (imm8) & 31)))); \ - } \ - ret; \ - })) -#endif -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_srli_epi32(a, imm8) simde_mm_srli_epi32(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_srli_epi64 (simde__m128i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - if (HEDLEY_UNLIKELY((imm8 & 63) != imm8)) - return simde_mm_setzero_si128(); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(-imm8)); - #else - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_94488) - r_.u64 = a_.u64 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.u64[i] = a_.u64[i] >> imm8; - } - #endif - #endif - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSE2_NATIVE) - #define simde_mm_srli_epi64(a, imm8) _mm_srli_epi64(a, imm8) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #define simde_mm_srli_epi64(a, imm8) \ - (((imm8) <= 0) ? \ - (a) : \ - simde__m128i_from_neon_u64( \ - ((imm8) > 63) ? \ - vandq_u64(simde__m128i_to_neon_u64(a), vdupq_n_u64(0)) : \ - vshrq_n_u64(simde__m128i_to_neon_u64(a), ((imm8) & 63) | (((imm8) & 63) == 0)))) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - #define simde_mm_srli_epi64(a, imm8) \ - ((imm8 < 64) ? 
wasm_u64x2_shr(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i64x2_const(0,0)) -#endif -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_srli_epi64(a, imm8) simde_mm_srli_epi64(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_store_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - _mm_store_pd(mem_addr, a); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - vst1q_f64(mem_addr, simde__m128d_to_private(a).neon_f64); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - vst1q_s64(HEDLEY_REINTERPRET_CAST(int64_t*, mem_addr), simde__m128d_to_private(a).neon_i64); - #else - simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128d), &a, sizeof(a)); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_store_pd(mem_addr, a) simde_mm_store_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_store1_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - _mm_store1_pd(mem_addr, a); - #else - simde__m128d_private a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - vst1q_f64(mem_addr, vdupq_laneq_f64(a_.neon_f64, 0)); - #else - mem_addr[0] = a_.f64[0]; - mem_addr[1] = a_.f64[0]; - #endif - #endif -} -#define simde_mm_store_pd1(mem_addr, a) simde_mm_store1_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a) -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_store1_pd(mem_addr, a) simde_mm_store1_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a) - #define _mm_store_pd1(mem_addr, a) simde_mm_store_pd1(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_store_sd (simde_float64* mem_addr, simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - _mm_store_sd(mem_addr, a); - #else - simde__m128d_private a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - const simde_float64 v = vgetq_lane_f64(a_.neon_f64, 0); - simde_memcpy(mem_addr, &v, sizeof(v)); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - const int64_t v = vgetq_lane_s64(a_.neon_i64, 0); - simde_memcpy(HEDLEY_REINTERPRET_CAST(int64_t*, mem_addr), &v, sizeof(v)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - wasm_v128_store64_lane(HEDLEY_REINTERPRET_CAST(void*, mem_addr), a_.wasm_v128, 0); - #else - simde_float64 v = a_.f64[0]; - simde_memcpy(mem_addr, &v, sizeof(simde_float64)); - #endif - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_store_sd(mem_addr, a) simde_mm_store_sd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_store_si128 (simde__m128i* mem_addr, simde__m128i a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - _mm_store_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a); - #else - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - vst1q_s32(HEDLEY_REINTERPRET_CAST(int32_t*, mem_addr), a_.neon_i32); - #else - simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128i), &a_, sizeof(a_)); - #endif - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_store_si128(mem_addr, a) simde_mm_store_si128(mem_addr, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void - simde_mm_storeh_pd (simde_float64* mem_addr, simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - _mm_storeh_pd(mem_addr, a); - #else - simde__m128d_private a_ = simde__m128d_to_private(a); - - #if 
defined(SIMDE_ARM_NEON_A64V8_NATIVE) - *mem_addr = vgetq_lane_f64(a_.neon_f64, 1); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - wasm_v128_store64_lane(HEDLEY_REINTERPRET_CAST(void*, mem_addr), a_.wasm_v128, 1); - #else - *mem_addr = a_.f64[1]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_storeh_pd(mem_addr, a) simde_mm_storeh_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_storel_epi64 (simde__m128i* mem_addr, simde__m128i a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - _mm_storel_epi64(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a); - #else - simde__m128i_private a_ = simde__m128i_to_private(a); - int64_t tmp; - - /* memcpy to prevent aliasing, tmp because we can't take the - * address of a vector element. */ - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - tmp = vgetq_lane_s64(a_.neon_i64, 0); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - #if defined(SIMDE_BUG_GCC_95227) - (void) a_; - #endif - tmp = vec_extract(a_.altivec_i64, 0); - #else - tmp = a_.i64[0]; - #endif - - simde_memcpy(mem_addr, &tmp, sizeof(tmp)); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_storel_epi64(mem_addr, a) simde_mm_storel_epi64(mem_addr, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_storel_pd (simde_float64* mem_addr, simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - _mm_storel_pd(mem_addr, a); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - wasm_v128_store64_lane(HEDLEY_REINTERPRET_CAST(void*, mem_addr), simde__m128d_to_wasm_v128(a), 0); - #else - simde__m128d_private a_ = simde__m128d_to_private(a); - - simde_float64 tmp; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - tmp = vgetq_lane_f64(a_.neon_f64, 0); - #else - tmp = a_.f64[0]; - #endif - simde_memcpy(mem_addr, &tmp, sizeof(tmp)); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_storel_pd(mem_addr, a) simde_mm_storel_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_storer_pd (simde_float64 mem_addr[2], simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - _mm_storer_pd(mem_addr, a); - #else - simde__m128d_private a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - vst1q_s64(HEDLEY_REINTERPRET_CAST(int64_t*, mem_addr), vextq_s64(a_.neon_i64, a_.neon_i64, 1)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - a_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, a_.wasm_v128, 1, 0); - simde_mm_store_pd(mem_addr, simde__m128d_from_private(a_)); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - a_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, a_.f64, 1, 0); - simde_mm_store_pd(mem_addr, simde__m128d_from_private(a_)); - #else - mem_addr[0] = a_.f64[1]; - mem_addr[1] = a_.f64[0]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_storer_pd(mem_addr, a) simde_mm_storer_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_storeu_pd (simde_float64* mem_addr, simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - _mm_storeu_pd(mem_addr, a); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - vst1q_f64(mem_addr, simde__m128d_to_private(a).neon_f64); - #else - simde_memcpy(mem_addr, &a, sizeof(a)); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_storeu_pd(mem_addr, a) simde_mm_storeu_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_storeu_si128 (void* mem_addr, simde__m128i a) { - #if 
defined(SIMDE_X86_SSE2_NATIVE) - _mm_storeu_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a); - #else - simde_memcpy(mem_addr, &a, sizeof(a)); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_storeu_si128(mem_addr, a) simde_mm_storeu_si128(mem_addr, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_storeu_si16 (void* mem_addr, simde__m128i a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && ( \ - SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \ - HEDLEY_GCC_VERSION_CHECK(11,0,0) || \ - HEDLEY_INTEL_VERSION_CHECK(20,21,1)) - _mm_storeu_si16(mem_addr, a); - #else - int16_t val = simde_x_mm_cvtsi128_si16(a); - simde_memcpy(mem_addr, &val, sizeof(val)); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_storeu_si16(mem_addr, a) simde_mm_storeu_si16(mem_addr, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_storeu_si32 (void* mem_addr, simde__m128i a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && ( \ - SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \ - HEDLEY_GCC_VERSION_CHECK(11,0,0) || \ - HEDLEY_INTEL_VERSION_CHECK(20,21,1)) - _mm_storeu_si32(mem_addr, a); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - wasm_v128_store32_lane(mem_addr, simde__m128i_to_wasm_v128(a), 0); - #else - int32_t val = simde_mm_cvtsi128_si32(a); - simde_memcpy(mem_addr, &val, sizeof(val)); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_storeu_si32(mem_addr, a) simde_mm_storeu_si32(mem_addr, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_storeu_si64 (void* mem_addr, simde__m128i a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && ( \ - SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \ - HEDLEY_GCC_VERSION_CHECK(11,0,0) || \ - HEDLEY_INTEL_VERSION_CHECK(20,21,1)) - _mm_storeu_si64(mem_addr, a); - #else - int64_t val = simde_mm_cvtsi128_si64(a); - simde_memcpy(mem_addr, &val, sizeof(val)); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_storeu_si64(mem_addr, a) simde_mm_storeu_si64(mem_addr, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_stream_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - _mm_stream_pd(mem_addr, a); - #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && ( \ - defined(SIMDE_VECTOR_SUBSCRIPT) || defined(SIMDE_ARM_NEON_A64V8_NATIVE) || \ - defined(SIMDE_WASM_SIMD128_NATIVE) || defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || \ - defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)) - __builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr)); - #else - simde_mm_store_pd(mem_addr, a); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_stream_pd(mem_addr, a) simde_mm_stream_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_stream_si128 (simde__m128i* mem_addr, simde__m128i a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) - _mm_stream_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a); - #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && ( \ - defined(SIMDE_VECTOR_SUBSCRIPT) || defined(SIMDE_ARM_NEON_A32V7_NATIVE) || \ - defined(SIMDE_WASM_SIMD128_NATIVE) || defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || \ - defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)) - __builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr)); - #else - simde_mm_store_si128(mem_addr, a); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_stream_si128(mem_addr, a) simde_mm_stream_si128(mem_addr, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES 
-void -simde_mm_stream_si32 (int32_t* mem_addr, int32_t a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - _mm_stream_si32(mem_addr, a); - #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) - __builtin_nontemporal_store(a, mem_addr); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - vst1q_lane_s32(mem_addr, vdupq_n_s32(a), 0); - #else - *mem_addr = a; - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_stream_si32(mem_addr, a) simde_mm_stream_si32(mem_addr, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_stream_si64 (int64_t* mem_addr, int64_t a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(HEDLEY_MSVC_VERSION) - _mm_stream_si64(SIMDE_CHECKED_REINTERPRET_CAST(long long int*, int64_t*, mem_addr), a); - #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) - __builtin_nontemporal_store(a, mem_addr); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - vst1_s64(mem_addr, vdup_n_s64(a)); - #else - *mem_addr = a; - #endif -} -#define simde_mm_stream_si64x(mem_addr, a) simde_mm_stream_si64(mem_addr, a) -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) - #define _mm_stream_si64(mem_addr, a) simde_mm_stream_si64(SIMDE_CHECKED_REINTERPRET_CAST(int64_t*, __int64*, mem_addr), a) - #define _mm_stream_si64x(mem_addr, a) simde_mm_stream_si64(SIMDE_CHECKED_REINTERPRET_CAST(int64_t*, __int64*, mem_addr), a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_sub_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_sub_epi8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vsubq_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_sub(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = a_.i8 - b_.i8; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[i] - b_.i8[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_sub_epi8(a, b) simde_mm_sub_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_sub_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_sub_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vsubq_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_sub(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = a_.i16 - b_.i16; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[i] - b_.i16[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_sub_epi16(a, b) simde_mm_sub_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_sub_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_sub_epi32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vsubq_s32(a_.neon_i32, b_.neon_i32); - #elif 
defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_sub(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = a_.i32 - b_.i32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] - b_.i32[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_sub_epi32(a, b) simde_mm_sub_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_sub_epi64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_sub_epi64(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vsubq_s64(a_.neon_i64, b_.neon_i64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x2_sub(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = a_.i64 - b_.i64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[i] - b_.i64[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_sub_epi64(a, b) simde_mm_sub_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_sub_epu32 (simde__m128i a, simde__m128i b) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u32 = a_.u32 - b_.u32; - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vsubq_u32(a_.neon_u32, b_.neon_u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] - b_.u32[i]; - } - #endif - - return simde__m128i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_sub_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_sub_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f64 = a_.f64 - b_.f64; - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vsubq_f64(a_.neon_f64, b_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_sub(a_.wasm_v128, b_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f64[i] - b_.f64[i]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_sub_pd(a, b) simde_mm_sub_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_sub_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_sub_sd(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_sd(a, simde_mm_sub_pd(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_sd(a, simde_mm_sub_pd(simde_x_mm_broadcastlow_pd(a), simde_x_mm_broadcastlow_pd(b))); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - r_.f64[0] = a_.f64[0] - b_.f64[0]; - r_.f64[1] = a_.f64[1]; - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_sub_sd(a, b) simde_mm_sub_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES 
-simde__m64 -simde_mm_sub_si64 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_sub_si64(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = a_.i64 - b_.i64; - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vsub_s64(a_.neon_i64, b_.neon_i64); - #else - r_.i64[0] = a_.i64[0] - b_.i64[0]; - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_sub_si64(a, b) simde_mm_sub_si64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_subs_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_subs_epi8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vqsubq_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_sub_sat(a_.wasm_v128, b_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = simde_math_subs_i8(a_.i8[i], b_.i8[i]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_subs_epi8(a, b) simde_mm_subs_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_subs_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_subs_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vqsubq_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_sub_sat(a_.wasm_v128, b_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = simde_math_subs_i16(a_.i16[i], b_.i16[i]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_subs_epi16(a, b) simde_mm_subs_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_subs_epu8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_subs_epu8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vqsubq_u8(a_.neon_u8, b_.neon_u8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u8x16_sub_sat(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_u8 = vec_subs(a_.altivec_u8, b_.altivec_u8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = simde_math_subs_u8(a_.u8[i], b_.u8[i]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_subs_epu8(a, b) simde_mm_subs_epu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_subs_epu16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_subs_epu16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vqsubq_u16(a_.neon_u16, b_.neon_u16); - #elif 
defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u16x8_sub_sat(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_u16 = vec_subs(a_.altivec_u16, b_.altivec_u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = simde_math_subs_u16(a_.u16[i], b_.u16[i]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_subs_epu16(a, b) simde_mm_subs_epu16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_ucomieq_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_ucomieq_sd(a, b); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - int r; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64); - uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64); - uint64x2_t a_or_b_nan = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vandq_u64(a_not_nan, b_not_nan)))); - uint64x2_t a_eq_b = vceqq_f64(a_.neon_f64, b_.neon_f64); - r = !!(vgetq_lane_u64(vorrq_u64(a_or_b_nan, a_eq_b), 0) != 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f64x2_extract_lane(a_.wasm_v128, 0) == wasm_f64x2_extract_lane(b_.wasm_v128, 0); - #elif defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = a_.f64[0] == b_.f64[0]; - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = a_.f64[0] == b_.f64[0]; - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_ucomieq_sd(a, b) simde_mm_ucomieq_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_ucomige_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_ucomige_sd(a, b); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - int r; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64); - uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64); - uint64x2_t a_and_b_not_nan = vandq_u64(a_not_nan, b_not_nan); - uint64x2_t a_ge_b = vcgeq_f64(a_.neon_f64, b_.neon_f64); - r = !!(vgetq_lane_u64(vandq_u64(a_and_b_not_nan, a_ge_b), 0) != 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f64x2_extract_lane(a_.wasm_v128, 0) >= wasm_f64x2_extract_lane(b_.wasm_v128, 0); - #elif defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = a_.f64[0] >= b_.f64[0]; - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = a_.f64[0] >= b_.f64[0]; - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_ucomige_sd(a, b) simde_mm_ucomige_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_ucomigt_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_ucomigt_sd(a, b); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - int r; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64); - uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64); - uint64x2_t a_and_b_not_nan = vandq_u64(a_not_nan, b_not_nan); - uint64x2_t a_gt_b = vcgtq_f64(a_.neon_f64, b_.neon_f64); - r = !!(vgetq_lane_u64(vandq_u64(a_and_b_not_nan, a_gt_b), 0) != 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f64x2_extract_lane(a_.wasm_v128, 0) > 
wasm_f64x2_extract_lane(b_.wasm_v128, 0); - #elif defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = a_.f64[0] > b_.f64[0]; - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = a_.f64[0] > b_.f64[0]; - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_ucomigt_sd(a, b) simde_mm_ucomigt_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_ucomile_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_ucomile_sd(a, b); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - int r; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64); - uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64); - uint64x2_t a_or_b_nan = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vandq_u64(a_not_nan, b_not_nan)))); - uint64x2_t a_le_b = vcleq_f64(a_.neon_f64, b_.neon_f64); - r = !!(vgetq_lane_u64(vorrq_u64(a_or_b_nan, a_le_b), 0) != 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f64x2_extract_lane(a_.wasm_v128, 0) <= wasm_f64x2_extract_lane(b_.wasm_v128, 0); - #elif defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = a_.f64[0] <= b_.f64[0]; - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = a_.f64[0] <= b_.f64[0]; - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_ucomile_sd(a, b) simde_mm_ucomile_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_ucomilt_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_ucomilt_sd(a, b); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - int r; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64); - uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64); - uint64x2_t a_or_b_nan = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vandq_u64(a_not_nan, b_not_nan)))); - uint64x2_t a_lt_b = vcltq_f64(a_.neon_f64, b_.neon_f64); - r = !!(vgetq_lane_u64(vorrq_u64(a_or_b_nan, a_lt_b), 0) != 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f64x2_extract_lane(a_.wasm_v128, 0) < wasm_f64x2_extract_lane(b_.wasm_v128, 0); - #elif defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = a_.f64[0] < b_.f64[0]; - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = a_.f64[0] < b_.f64[0]; - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_ucomilt_sd(a, b) simde_mm_ucomilt_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_ucomineq_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_ucomineq_sd(a, b); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - int r; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64); - uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64); - uint64x2_t a_and_b_not_nan = vandq_u64(a_not_nan, b_not_nan); - uint64x2_t a_neq_b = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(a_.neon_f64, b_.neon_f64)))); - r = !!(vgetq_lane_u64(vandq_u64(a_and_b_not_nan, a_neq_b), 0) != 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f64x2_extract_lane(a_.wasm_v128, 0) != wasm_f64x2_extract_lane(b_.wasm_v128, 0); - #elif 
defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = a_.f64[0] != b_.f64[0]; - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = a_.f64[0] != b_.f64[0]; - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_ucomineq_sd(a, b) simde_mm_ucomineq_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_lfence (void) { - #if defined(SIMDE_X86_SSE2_NATIVE) - _mm_lfence(); - #else - simde_mm_sfence(); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_lfence() simde_mm_lfence() -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_mfence (void) { - #if defined(SIMDE_X86_SSE2_NATIVE) - _mm_mfence(); - #else - simde_mm_sfence(); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_mfence() simde_mm_mfence() -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_unpackhi_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_unpackhi_epi8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i8 = vzip2q_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int8x8_t a1 = vreinterpret_s8_s16(vget_high_s16(a_.neon_i16)); - int8x8_t b1 = vreinterpret_s8_s16(vget_high_s16(b_.neon_i16)); - int8x8x2_t result = vzip_s8(a1, b1); - r_.neon_i8 = vcombine_s8(result.val[0], result.val[1]); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_shuffle(a_.wasm_v128, b_.wasm_v128, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, b_.i8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i8[0])) / 2) ; i++) { - r_.i8[(i * 2)] = a_.i8[i + ((sizeof(r_) / sizeof(r_.i8[0])) / 2)]; - r_.i8[(i * 2) + 1] = b_.i8[i + ((sizeof(r_) / sizeof(r_.i8[0])) / 2)]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_unpackhi_epi8(a, b) simde_mm_unpackhi_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_unpackhi_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_unpackhi_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i16 = vzip2q_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int16x4_t a1 = vget_high_s16(a_.neon_i16); - int16x4_t b1 = vget_high_s16(b_.neon_i16); - int16x4x2_t result = vzip_s16(a1, b1); - r_.neon_i16 = vcombine_s16(result.val[0], result.val[1]); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_shuffle(a_.wasm_v128, b_.wasm_v128, 4, 12, 5, 13, 6, 14, 7, 15); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 4, 12, 5, 13, 6, 14, 7, 15); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i16[0])) / 2) ; i++) { - r_.i16[(i * 2)] = a_.i16[i + ((sizeof(r_) / sizeof(r_.i16[0])) / 2)]; - r_.i16[(i * 2) + 1] = b_.i16[i + ((sizeof(r_) / sizeof(r_.i16[0])) / 2)]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_unpackhi_epi16(a, b) 
simde_mm_unpackhi_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_unpackhi_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_unpackhi_epi32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i32 = vzip2q_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int32x2_t a1 = vget_high_s32(a_.neon_i32); - int32x2_t b1 = vget_high_s32(b_.neon_i32); - int32x2x2_t result = vzip_s32(a1, b1); - r_.neon_i32 = vcombine_s32(result.val[0], result.val[1]); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 2, 6, 3, 7); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 2, 6, 3, 7); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i32[0])) / 2) ; i++) { - r_.i32[(i * 2)] = a_.i32[i + ((sizeof(r_) / sizeof(r_.i32[0])) / 2)]; - r_.i32[(i * 2) + 1] = b_.i32[i + ((sizeof(r_) / sizeof(r_.i32[0])) / 2)]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_unpackhi_epi32(a, b) simde_mm_unpackhi_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_unpackhi_epi64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_unpackhi_epi64(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int64x1_t a_h = vget_high_s64(a_.neon_i64); - int64x1_t b_h = vget_high_s64(b_.neon_i64); - r_.neon_i64 = vcombine_s64(a_h, b_h); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 1, 3); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.i64, b_.i64, 1, 3); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i64[0])) / 2) ; i++) { - r_.i64[(i * 2)] = a_.i64[i + ((sizeof(r_) / sizeof(r_.i64[0])) / 2)]; - r_.i64[(i * 2) + 1] = b_.i64[i + ((sizeof(r_) / sizeof(r_.i64[0])) / 2)]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_unpackhi_epi64(a, b) simde_mm_unpackhi_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_unpackhi_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_unpackhi_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vzip2q_f64(a_.neon_f64, b_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 1, 3); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 1, 3); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.f64[0])) / 2) ; i++) { - r_.f64[(i * 2)] = a_.f64[i + ((sizeof(r_) / sizeof(r_.f64[0])) / 2)]; - r_.f64[(i * 2) + 1] = b_.f64[i + ((sizeof(r_) / sizeof(r_.f64[0])) / 2)]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_unpackhi_pd(a, b) simde_mm_unpackhi_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_unpacklo_epi8 (simde__m128i a, 
simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_unpacklo_epi8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i8 = vzip1q_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(a_.neon_i16)); - int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(b_.neon_i16)); - int8x8x2_t result = vzip_s8(a1, b1); - r_.neon_i8 = vcombine_s8(result.val[0], result.val[1]); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, b_.i8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i8[0])) / 2) ; i++) { - r_.i8[(i * 2)] = a_.i8[i]; - r_.i8[(i * 2) + 1] = b_.i8[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_unpacklo_epi8(a, b) simde_mm_unpacklo_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_unpacklo_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_unpacklo_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i16 = vzip1q_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int16x4_t a1 = vget_low_s16(a_.neon_i16); - int16x4_t b1 = vget_low_s16(b_.neon_i16); - int16x4x2_t result = vzip_s16(a1, b1); - r_.neon_i16 = vcombine_s16(result.val[0], result.val[1]); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 8, 1, 9, 2, 10, 3, 11); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 0, 8, 1, 9, 2, 10, 3, 11); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i16[0])) / 2) ; i++) { - r_.i16[(i * 2)] = a_.i16[i]; - r_.i16[(i * 2) + 1] = b_.i16[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_unpacklo_epi16(a, b) simde_mm_unpacklo_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_unpacklo_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_unpacklo_epi32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i32 = vzip1q_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int32x2_t a1 = vget_low_s32(a_.neon_i32); - int32x2_t b1 = vget_low_s32(b_.neon_i32); - int32x2x2_t result = vzip_s32(a1, b1); - r_.neon_i32 = vcombine_s32(result.val[0], result.val[1]); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 4, 1, 5); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 0, 4, 1, 5); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i32[0])) / 2) ; i++) { - r_.i32[(i * 2)] = a_.i32[i]; - r_.i32[(i * 2) + 1] = b_.i32[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if 
defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_unpacklo_epi32(a, b) simde_mm_unpacklo_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_unpacklo_epi64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_unpacklo_epi64(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int64x1_t a_l = vget_low_s64(a_.neon_i64); - int64x1_t b_l = vget_low_s64(b_.neon_i64); - r_.neon_i64 = vcombine_s64(a_l, b_l); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 2); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.i64, b_.i64, 0, 2); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i64[0])) / 2) ; i++) { - r_.i64[(i * 2)] = a_.i64[i]; - r_.i64[(i * 2) + 1] = b_.i64[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_unpacklo_epi64(a, b) simde_mm_unpacklo_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_unpacklo_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_unpacklo_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vzip1q_f64(a_.neon_f64, b_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 2); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 0, 2); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.f64[0])) / 2) ; i++) { - r_.f64[(i * 2)] = a_.f64[i]; - r_.f64[(i * 2) + 1] = b_.f64[i]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_unpacklo_pd(a, b) simde_mm_unpacklo_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_x_mm_negate_pd(simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return simde_mm_xor_pd(a, _mm_set1_pd(SIMDE_FLOAT64_C(-0.0))); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,1,0)) - r_.altivec_f64 = vec_neg(a_.altivec_f64); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vnegq_f64(a_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_neg(a_.wasm_v128); - #elif defined(SIMDE_VECTOR_NEGATE) - r_.f64 = -a_.f64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = -a_.f64[i]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_xor_si128 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_xor_si128(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = veorq_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i32 = vec_xor(a_.altivec_i32, b_.altivec_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_xor(b_.wasm_v128, a_.wasm_v128); - #elif 
defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f ^ b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = a_.i32f[i] ^ b_.i32f[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_xor_si128(a, b) simde_mm_xor_si128(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_not_si128 (simde__m128i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_ternarylogic_epi32(a, a, a, 0x55); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vmvnq_s32(a_.neon_i32); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i32 = vec_nor(a_.altivec_i32, a_.altivec_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_not(a_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = ~a_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = ~(a_.i32f[i]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} - -#define SIMDE_MM_SHUFFLE2(x, y) (((x) << 1) | (y)) -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _MM_SHUFFLE2(x, y) SIMDE_MM_SHUFFLE2(x, y) -#endif - -SIMDE_END_DECLS_ - -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_SSE2_H) */ diff --git a/extern/simde/x86/sse3.h b/extern/simde/x86/sse3.h deleted file mode 100644 index db2683c30..000000000 --- a/extern/simde/x86/sse3.h +++ /dev/null @@ -1,515 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2017-2020 Evan Nemerson - */ - -#if !defined(SIMDE_X86_SSE3_H) -#define SIMDE_X86_SSE3_H - -#include "sse2.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_deinterleaveeven_epi16 (simde__m128i a, simde__m128i b) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i16 = vuzp1q_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int16x8x2_t t = vuzpq_s16(a_.neon_i16, b_.neon_i16); - r_.neon_i16 = t.val[0]; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 2, 4, 6, 8, 10, 12, 14); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 0, 2, 4, 6, 8, 10, 12, 14); - #else - const size_t halfway_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 2; - for(size_t i = 0 ; i < halfway_point ; i++) { - r_.i16[i] = a_.i16[2 * i]; - r_.i16[i + halfway_point] = b_.i16[2 * i]; - } - #endif - - return simde__m128i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_deinterleaveodd_epi16 (simde__m128i a, simde__m128i b) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i16 = vuzp2q_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int16x8x2_t t = vuzpq_s16(a_.neon_i16, b_.neon_i16); - r_.neon_i16 = t.val[1]; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_shuffle(a_.wasm_v128, b_.wasm_v128, 1, 3, 5, 7, 9, 11, 13, 15); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 1, 3, 5, 7, 9, 11, 13, 15); - #else - const size_t halfway_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 2; - for(size_t i = 0 ; i < halfway_point ; i++) { - r_.i16[i] = a_.i16[2 * i + 1]; - r_.i16[i + halfway_point] = b_.i16[2 * i + 1]; - } - #endif - - return simde__m128i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_deinterleaveeven_epi32 (simde__m128i a, simde__m128i b) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i32 = vuzp1q_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int32x4x2_t t = vuzpq_s32(a_.neon_i32, b_.neon_i32); - r_.neon_i32 = t.val[0]; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 2, 4, 6); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 0, 2, 4, 6); - #else - const size_t halfway_point = (sizeof(r_.i32) / sizeof(r_.i32[0])) / 2; - for(size_t i = 0 ; i < halfway_point ; i++) { - r_.i32[i] = a_.i32[2 * i]; - r_.i32[i + halfway_point] = b_.i32[2 * i]; - } - #endif - - return simde__m128i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_deinterleaveodd_epi32 (simde__m128i a, simde__m128i b) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i32 = vuzp2q_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int32x4x2_t t = vuzpq_s32(a_.neon_i32, b_.neon_i32); - r_.neon_i32 = t.val[1]; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - 
r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 1, 3, 5, 7); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 1, 3, 5, 7); - #else - const size_t halfway_point = (sizeof(r_.i32) / sizeof(r_.i32[0])) / 2; - for(size_t i = 0 ; i < halfway_point ; i++) { - r_.i32[i] = a_.i32[2 * i + 1]; - r_.i32[i + halfway_point] = b_.i32[2 * i + 1]; - } - #endif - - return simde__m128i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_x_mm_deinterleaveeven_ps (simde__m128 a, simde__m128 b) { - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f32 = vuzp1q_f32(a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - float32x4x2_t t = vuzpq_f32(a_.neon_f32, b_.neon_f32); - r_.neon_f32 = t.val[0]; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 2, 4, 6); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 0, 2, 4, 6); - #else - const size_t halfway_point = (sizeof(r_.f32) / sizeof(r_.f32[0])) / 2; - for(size_t i = 0 ; i < halfway_point ; i++) { - r_.f32[i] = a_.f32[2 * i]; - r_.f32[i + halfway_point] = b_.f32[2 * i]; - } - #endif - - return simde__m128_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_x_mm_deinterleaveodd_ps (simde__m128 a, simde__m128 b) { - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f32 = vuzp2q_f32(a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - float32x4x2_t t = vuzpq_f32(a_.neon_f32, b_.neon_f32); - r_.neon_f32 = t.val[1]; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 1, 3, 5, 7); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 1, 3, 5, 7); - #else - const size_t halfway_point = (sizeof(r_.f32) / sizeof(r_.f32[0])) / 2; - for(size_t i = 0 ; i < halfway_point ; i++) { - r_.f32[i] = a_.f32[2 * i + 1]; - r_.f32[i + halfway_point] = b_.f32[2 * i + 1]; - } - #endif - - return simde__m128_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_x_mm_deinterleaveeven_pd (simde__m128d a, simde__m128d b) { - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vuzp1q_f64(a_.neon_f64, b_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 2); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 0, 2); - #else - const size_t halfway_point = (sizeof(r_.f64) / sizeof(r_.f64[0])) / 2; - for(size_t i = 0 ; i < halfway_point ; i++) { - r_.f64[i] = a_.f64[2 * i]; - r_.f64[i + halfway_point] = b_.f64[2 * i]; - } - #endif - - return simde__m128d_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_x_mm_deinterleaveodd_pd (simde__m128d a, simde__m128d b) { - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vuzp2q_f64(a_.neon_f64, b_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 1, 3); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.f64 = 
SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 1, 3); - #else - const size_t halfway_point = (sizeof(r_.f64) / sizeof(r_.f64[0])) / 2; - for(size_t i = 0 ; i < halfway_point ; i++) { - r_.f64[i] = a_.f64[2 * i + 1]; - r_.f64[i + halfway_point] = b_.f64[2 * i + 1]; - } - #endif - - return simde__m128d_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_addsub_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE3_NATIVE) - return _mm_addsub_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - float64x2_t rs = vsubq_f64(a_.neon_f64, b_.neon_f64); - float64x2_t ra = vaddq_f64(a_.neon_f64, b_.neon_f64); - return vcombine_f64(vget_low_f64(rs), vget_high_f64(ra)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_) - r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64 - b_.f64, a_.f64 + b_.f64, 0, 3); - #else - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i += 2) { - r_.f64[ i ] = a_.f64[ i ] - b_.f64[ i ]; - r_.f64[1 + i] = a_.f64[1 + i] + b_.f64[1 + i]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES) -# define _mm_addsub_pd(a, b) simde_mm_addsub_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_addsub_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE3_NATIVE) - return _mm_addsub_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - float32x4_t rs = vsubq_f32(a_.neon_f32, b_.neon_f32); - float32x4_t ra = vaddq_f32(a_.neon_f32, b_.neon_f32); - return vtrn2q_f32(vreinterpretq_f32_s32(vrev64q_s32(vreinterpretq_s32_f32(rs))), ra); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32 - b_.f32, a_.f32 + b_.f32, 0, 5, 2, 7); - #else - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i += 2) { - r_.f32[ i ] = a_.f32[ i ] - b_.f32[ i ]; - r_.f32[1 + i] = a_.f32[1 + i] + b_.f32[1 + i]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES) -# define _mm_addsub_ps(a, b) simde_mm_addsub_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_hadd_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE3_NATIVE) - return _mm_hadd_pd(a, b); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return simde__m128d_from_neon_f64(vpaddq_f64(simde__m128d_to_neon_f64(a), simde__m128d_to_neon_f64(b))); - #else - return simde_mm_add_pd(simde_x_mm_deinterleaveeven_pd(a, b), simde_x_mm_deinterleaveodd_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES) -# define _mm_hadd_pd(a, b) simde_mm_hadd_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_hadd_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE3_NATIVE) - return _mm_hadd_ps(a, b); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return simde__m128_from_neon_f32(vpaddq_f32(simde__m128_to_neon_f32(a), simde__m128_to_neon_f32(b))); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - float32x4x2_t t = vuzpq_f32(simde__m128_to_neon_f32(a), simde__m128_to_neon_f32(b)); - return simde__m128_from_neon_f32(vaddq_f32(t.val[0], t.val[1])); - #else - return simde_mm_add_ps(simde_x_mm_deinterleaveeven_ps(a, b), simde_x_mm_deinterleaveodd_ps(a, b)); - #endif -} -#if 
defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES) -# define _mm_hadd_ps(a, b) simde_mm_hadd_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_hsub_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE3_NATIVE) - return _mm_hsub_pd(a, b); - #else - return simde_mm_sub_pd(simde_x_mm_deinterleaveeven_pd(a, b), simde_x_mm_deinterleaveodd_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES) -# define _mm_hsub_pd(a, b) simde_mm_hsub_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_hsub_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE3_NATIVE) - return _mm_hsub_ps(a, b); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - float32x4x2_t t = vuzpq_f32(simde__m128_to_neon_f32(a), simde__m128_to_neon_f32(b)); - return simde__m128_from_neon_f32(vaddq_f32(t.val[0], vnegq_f32(t.val[1]))); - #else - return simde_mm_sub_ps(simde_x_mm_deinterleaveeven_ps(a, b), simde_x_mm_deinterleaveodd_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES) -# define _mm_hsub_ps(a, b) simde_mm_hsub_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_lddqu_si128 (simde__m128i const* mem_addr) { - #if defined(SIMDE_X86_SSE3_NATIVE) - return _mm_lddqu_si128(mem_addr); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr)); - #else - simde_memcpy(&r_, mem_addr, sizeof(r_)); - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES) -# define _mm_lddqu_si128(mem_addr) simde_mm_lddqu_si128(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_loaddup_pd (simde_float64 const* mem_addr) { - #if defined(SIMDE_X86_SSE3_NATIVE) - return _mm_loaddup_pd(mem_addr); - #else - simde__m128d_private r_; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vdupq_n_f64(*mem_addr); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vdupq_n_s64(*HEDLEY_REINTERPRET_CAST(int64_t const*, mem_addr)); - #else - r_.f64[0] = *mem_addr; - r_.f64[1] = *mem_addr; - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES) -# define _mm_loaddup_pd(mem_addr) simde_mm_loaddup_pd(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_movedup_pd (simde__m128d a) { - #if defined(SIMDE_X86_SSE3_NATIVE) - return _mm_movedup_pd(a); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vdupq_laneq_f64(a_.neon_f64, 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, a_.wasm_v128, 0, 0); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_SHUFFLE_VECTOR_) - r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, a_.f64, 0, 0); - #else - r_.f64[0] = a_.f64[0]; - r_.f64[1] = a_.f64[0]; - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES) -# define _mm_movedup_pd(a) simde_mm_movedup_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_movehdup_ps (simde__m128 a) { - #if defined(SIMDE_X86_SSE3_NATIVE) - return _mm_movehdup_ps(a); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f32 = vtrn2q_f32(a_.neon_f32, a_.neon_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, a_.wasm_v128, 1, 
1, 3, 3); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 1, 1, 3, 3); - #else - r_.f32[0] = a_.f32[1]; - r_.f32[1] = a_.f32[1]; - r_.f32[2] = a_.f32[3]; - r_.f32[3] = a_.f32[3]; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES) -# define _mm_movehdup_ps(a) simde_mm_movehdup_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_moveldup_ps (simde__m128 a) { - #if defined(SIMDE__SSE3_NATIVE) - return _mm_moveldup_ps(a); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f32 = vtrn1q_f32(a_.neon_f32, a_.neon_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, a_.wasm_v128, 0, 0, 2, 2); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 0, 0, 2, 2); - #else - r_.f32[0] = a_.f32[0]; - r_.f32[1] = a_.f32[0]; - r_.f32[2] = a_.f32[2]; - r_.f32[3] = a_.f32[2]; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES) -# define _mm_moveldup_ps(a) simde_mm_moveldup_ps(a) -#endif - -SIMDE_END_DECLS_ - -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_SSE3_H) */ diff --git a/extern/simde/x86/sse4.1.h b/extern/simde/x86/sse4.1.h deleted file mode 100644 index 15a197b95..000000000 --- a/extern/simde/x86/sse4.1.h +++ /dev/null @@ -1,2367 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2017-2020 Evan Nemerson - */ - -#include "sse.h" -#if !defined(SIMDE_X86_SSE4_1_H) -#define SIMDE_X86_SSE4_1_H - -#include "ssse3.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -#if !defined(SIMDE_X86_SSE4_1_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES) -# define SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_blend_epi16 (simde__m128i a, simde__m128i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = ((imm8 >> i) & 1) ? 
b_.u16[i] : a_.u16[i]; - } - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) - #define simde_mm_blend_epi16(a, b, imm8) _mm_blend_epi16(a, b, imm8) -#elif defined(SIMDE_SHUFFLE_VECTOR_) - #define simde_mm_blend_epi16(a, b, imm8) \ - (__extension__ ({ \ - simde__m128i_private \ - simde_mm_blend_epi16_a_ = simde__m128i_to_private(a), \ - simde_mm_blend_epi16_b_ = simde__m128i_to_private(b), \ - simde_mm_blend_epi16_r_; \ - \ - simde_mm_blend_epi16_r_.i16 = \ - SIMDE_SHUFFLE_VECTOR_( \ - 16, 16, \ - simde_mm_blend_epi16_a_.i16, \ - simde_mm_blend_epi16_b_.i16, \ - ((imm8) & (1 << 0)) ? 8 : 0, \ - ((imm8) & (1 << 1)) ? 9 : 1, \ - ((imm8) & (1 << 2)) ? 10 : 2, \ - ((imm8) & (1 << 3)) ? 11 : 3, \ - ((imm8) & (1 << 4)) ? 12 : 4, \ - ((imm8) & (1 << 5)) ? 13 : 5, \ - ((imm8) & (1 << 6)) ? 14 : 6, \ - ((imm8) & (1 << 7)) ? 15 : 7 \ - ); \ - \ - simde__m128i_from_private(simde_mm_blend_epi16_r_); \ - })) -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_blend_epi16 - #define _mm_blend_epi16(a, b, imm8) simde_mm_blend_epi16(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_blend_pd (simde__m128d a, simde__m128d b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) { - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = ((imm8 >> i) & 1) ? b_.f64[i] : a_.f64[i]; - } - return simde__m128d_from_private(r_); -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) - #define simde_mm_blend_pd(a, b, imm8) _mm_blend_pd(a, b, imm8) -#elif defined(SIMDE_SHUFFLE_VECTOR_) - #define simde_mm_blend_pd(a, b, imm8) \ - (__extension__ ({ \ - simde__m128d_private \ - simde_mm_blend_pd_a_ = simde__m128d_to_private(a), \ - simde_mm_blend_pd_b_ = simde__m128d_to_private(b), \ - simde_mm_blend_pd_r_; \ - \ - simde_mm_blend_pd_r_.f64 = \ - SIMDE_SHUFFLE_VECTOR_( \ - 64, 16, \ - simde_mm_blend_pd_a_.f64, \ - simde_mm_blend_pd_b_.f64, \ - ((imm8) & (1 << 0)) ? 2 : 0, \ - ((imm8) & (1 << 1)) ? 3 : 1 \ - ); \ - \ - simde__m128d_from_private(simde_mm_blend_pd_r_); \ - })) -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_blend_pd - #define _mm_blend_pd(a, b, imm8) simde_mm_blend_pd(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_blend_ps (simde__m128 a, simde__m128 b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) { - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = ((imm8 >> i) & 1) ? b_.f32[i] : a_.f32[i]; - } - return simde__m128_from_private(r_); -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) -# define simde_mm_blend_ps(a, b, imm8) _mm_blend_ps(a, b, imm8) -#elif defined(SIMDE_SHUFFLE_VECTOR_) - #define simde_mm_blend_ps(a, b, imm8) \ - (__extension__ ({ \ - simde__m128_private \ - simde_mm_blend_ps_a_ = simde__m128_to_private(a), \ - simde_mm_blend_ps_b_ = simde__m128_to_private(b), \ - simde_mm_blend_ps_r_; \ - \ - simde_mm_blend_ps_r_.f32 = \ - SIMDE_SHUFFLE_VECTOR_( \ - 32, 16, \ - simde_mm_blend_ps_a_.f32, \ - simde_mm_blend_ps_b_.f32, \ - ((imm8) & (1 << 0)) ? 4 : 0, \ - ((imm8) & (1 << 1)) ? 5 : 1, \ - ((imm8) & (1 << 2)) ? 6 : 2, \ - ((imm8) & (1 << 3)) ? 
7 : 3 \ - ); \ - \ - simde__m128_from_private(simde_mm_blend_ps_r_); \ - })) -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_blend_ps - #define _mm_blend_ps(a, b, imm8) simde_mm_blend_ps(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_blendv_epi8 (simde__m128i a, simde__m128i b, simde__m128i mask) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_blendv_epi8(a, b, mask); - #elif defined(SIMDE_X86_SSE2_NATIVE) - __m128i m = _mm_cmpgt_epi8(_mm_setzero_si128(), mask); - return _mm_xor_si128(_mm_subs_epu8(_mm_xor_si128(a, b), m), b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - mask_ = simde__m128i_to_private(mask); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - /* Use a signed shift right to create a mask with the sign bit */ - mask_.neon_i8 = vshrq_n_s8(mask_.neon_i8, 7); - r_.neon_i8 = vbslq_s8(mask_.neon_u8, b_.neon_i8, a_.neon_i8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t m = wasm_i8x16_shr(mask_.wasm_v128, 7); - r_.wasm_v128 = wasm_v128_bitselect(b_.wasm_v128, a_.wasm_v128, m); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i8 = vec_sel(a_.altivec_i8, b_.altivec_i8, vec_cmplt(mask_.altivec_i8, vec_splat_s8(0))); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - /* https://software.intel.com/en-us/forums/intel-c-compiler/topic/850087 */ - #if defined(HEDLEY_INTEL_VERSION_CHECK) - __typeof__(mask_.i8) z = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; - mask_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(mask_.i8), mask_.i8 < z); - #else - mask_.i8 >>= (CHAR_BIT * sizeof(mask_.i8[0])) - 1; - #endif - - r_.i8 = (mask_.i8 & b_.i8) | (~mask_.i8 & a_.i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - int8_t m = mask_.i8[i] >> 7; - r_.i8[i] = (m & b_.i8[i]) | (~m & a_.i8[i]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_blendv_epi8 - #define _mm_blendv_epi8(a, b, mask) simde_mm_blendv_epi8(a, b, mask) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_blendv_epi16 (simde__m128i a, simde__m128i b, simde__m128i mask) { - #if defined(SIMDE_X86_SSE2_NATIVE) - mask = simde_mm_srai_epi16(mask, 15); - return simde_mm_or_si128(simde_mm_and_si128(mask, b), simde_mm_andnot_si128(mask, a)); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - mask_ = simde__m128i_to_private(mask); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - mask_ = simde__m128i_to_private(simde_mm_cmplt_epi16(mask, simde_mm_setzero_si128())); - r_.neon_i16 = vbslq_s16(mask_.neon_u16, b_.neon_i16, a_.neon_i16); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i16 = vec_sel(a_.altivec_i16, b_.altivec_i16, vec_cmplt(mask_.altivec_i16, vec_splat_s16(0))); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - #if defined(HEDLEY_INTEL_VERSION_CHECK) - __typeof__(mask_.i16) z = { 0, 0, 0, 0, 0, 0, 0, 0 }; - mask_.i16 = mask_.i16 < z; - #else - mask_.i16 >>= (CHAR_BIT * sizeof(mask_.i16[0])) - 1; - #endif - - r_.i16 = (mask_.i16 & b_.i16) | (~mask_.i16 & a_.i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - int16_t m = mask_.i16[i] >> 15; - r_.i16[i] = (m & b_.i16[i]) | (~m & a_.i16[i]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} - 
-SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_blendv_epi32 (simde__m128i a, simde__m128i b, simde__m128i mask) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _mm_castsi128_ps(mask))); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - mask_ = simde__m128i_to_private(mask); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - mask_ = simde__m128i_to_private(simde_mm_cmplt_epi32(mask, simde_mm_setzero_si128())); - r_.neon_i32 = vbslq_s32(mask_.neon_u32, b_.neon_i32, a_.neon_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t m = wasm_i32x4_shr(mask_.wasm_v128, 31); - r_.wasm_v128 = wasm_v128_or(wasm_v128_and(b_.wasm_v128, m), wasm_v128_andnot(a_.wasm_v128, m)); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i32 = vec_sel(a_.altivec_i32, b_.altivec_i32, vec_cmplt(mask_.altivec_i32, vec_splat_s32(0))); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - #if defined(HEDLEY_INTEL_VERSION_CHECK) - __typeof__(mask_.i32) z = { 0, 0, 0, 0 }; - mask_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(mask_.i32), mask_.i32 < z); - #else - mask_.i32 >>= (CHAR_BIT * sizeof(mask_.i32[0])) - 1; - #endif - - r_.i32 = (mask_.i32 & b_.i32) | (~mask_.i32 & a_.i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - int32_t m = mask_.i32[i] >> 31; - r_.i32[i] = (m & b_.i32[i]) | (~m & a_.i32[i]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_blendv_epi64 (simde__m128i a, simde__m128i b, simde__m128i mask) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_castpd_si128(_mm_blendv_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b), _mm_castsi128_pd(mask))); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - mask_ = simde__m128i_to_private(mask); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - mask_.neon_u64 = vcltq_s64(mask_.neon_i64, vdupq_n_s64(UINT64_C(0))); - r_.neon_i64 = vbslq_s64(mask_.neon_u64, b_.neon_i64, a_.neon_i64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t m = wasm_i64x2_shr(mask_.wasm_v128, 63); - r_.wasm_v128 = wasm_v128_or(wasm_v128_and(b_.wasm_v128, m), wasm_v128_andnot(a_.wasm_v128, m)); - #elif (defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(SIMDE_BUG_CLANG_46770)) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i64 = vec_sel(a_.altivec_i64, b_.altivec_i64, vec_cmplt(mask_.altivec_i64, vec_splats(HEDLEY_STATIC_CAST(signed long long, 0)))); - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - SIMDE_POWER_ALTIVEC_VECTOR(signed long long) selector = vec_sra(mask_.altivec_i64, vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 63))); - r_.altivec_i32 = vec_sel(a_.altivec_i32, b_.altivec_i32, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), selector)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - #if defined(HEDLEY_INTEL_VERSION_CHECK) - __typeof__(mask_.i64) z = { 0, 0 }; - mask_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(mask_.i64), mask_.i64 < z); - #else - mask_.i64 >>= (CHAR_BIT * sizeof(mask_.i64[0])) - 1; - #endif - - r_.i64 = (mask_.i64 & b_.i64) | (~mask_.i64 & a_.i64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - int64_t m = mask_.i64[i] >> 63; - r_.i64[i] = (m & b_.i64[i]) | (~m & a_.i64[i]); - } - #endif - - return simde__m128i_from_private(r_); - 
#endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_blendv_pd (simde__m128d a, simde__m128d b, simde__m128d mask) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_blendv_pd(a, b, mask); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t m_ = wasm_i64x2_shr(HEDLEY_REINTERPRET_CAST(v128_t, mask), 63); - return simde__m128d_from_wasm_v128(wasm_v128_bitselect(simde__m128d_to_wasm_v128(b), simde__m128d_to_wasm_v128(a), m_)); - #else - return simde_mm_castsi128_pd(simde_x_mm_blendv_epi64(simde_mm_castpd_si128(a), simde_mm_castpd_si128(b), simde_mm_castpd_si128(mask))); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_blendv_pd - #define _mm_blendv_pd(a, b, mask) simde_mm_blendv_pd(a, b, mask) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_blendv_ps (simde__m128 a, simde__m128 b, simde__m128 mask) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_blendv_ps(a, b, mask); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t m_ = wasm_i32x4_shr(HEDLEY_REINTERPRET_CAST(v128_t, mask), 31); - return simde__m128d_from_wasm_v128(wasm_v128_bitselect(simde__m128d_to_wasm_v128(b), simde__m128d_to_wasm_v128(a), m_)); - #else - return simde_mm_castsi128_ps(simde_x_mm_blendv_epi32(simde_mm_castps_si128(a), simde_mm_castps_si128(b), simde_mm_castps_si128(mask))); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_blendv_ps - #define _mm_blendv_ps(a, b, mask) simde_mm_blendv_ps(a, b, mask) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_round_pd (simde__m128d a, int rounding) - SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) { - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - /* For architectures which lack a current direction SIMD instruction. */ - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - if ((rounding & 7) == SIMDE_MM_FROUND_CUR_DIRECTION) - rounding = HEDLEY_STATIC_CAST(int, SIMDE_MM_GET_ROUNDING_MODE()) << 13; - #endif - - switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) { - case SIMDE_MM_FROUND_CUR_DIRECTION: - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_round(a_.altivec_f64)); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vrndiq_f64(a_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_nearest(a_.wasm_v128); - #elif defined(simde_math_nearbyint) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_nearbyint(a_.f64[i]); - } - #else - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); - #endif - break; - - case SIMDE_MM_FROUND_TO_NEAREST_INT: - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_round(a_.altivec_f64)); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vrndaq_f64(a_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_nearest(a_.wasm_v128); - #elif defined(simde_math_roundeven) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_roundeven(a_.f64[i]); - } - #else - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); - #endif - break; - - case SIMDE_MM_FROUND_TO_NEG_INF: - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), 
vec_floor(a_.altivec_f64)); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vrndmq_f64(a_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_floor(a_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_floor(a_.f64[i]); - } - #endif - break; - - case SIMDE_MM_FROUND_TO_POS_INF: - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_ceil(a_.altivec_f64)); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vrndpq_f64(a_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_ceil(a_.wasm_v128); - #elif defined(simde_math_ceil) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_ceil(a_.f64[i]); - } - #else - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); - #endif - break; - - case SIMDE_MM_FROUND_TO_ZERO: - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_trunc(a_.altivec_f64)); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vrndq_f64(a_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_trunc(a_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_trunc(a_.f64[i]); - } - #endif - break; - - default: - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); - } - - return simde__m128d_from_private(r_); -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) - #define simde_mm_round_pd(a, rounding) _mm_round_pd(a, rounding) -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_round_pd - #define _mm_round_pd(a, rounding) simde_mm_round_pd(a, rounding) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_ceil_pd (simde__m128d a) { - #if defined(SIMDE_WASM_SIMD128_NATIVE) - return simde__m128d_from_wasm_v128(wasm_f64x2_ceil(simde__m128d_to_wasm_v128(a))); - #endif - return simde_mm_round_pd(a, SIMDE_MM_FROUND_TO_POS_INF); -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_ceil_pd - #define _mm_ceil_pd(a) simde_mm_ceil_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_ceil_ps (simde__m128 a) { - #if defined(SIMDE_WASM_SIMD128_NATIVE) - return simde__m128_from_wasm_v128(wasm_f32x4_ceil(simde__m128_to_wasm_v128(a))); - #endif - return simde_mm_round_ps(a, SIMDE_MM_FROUND_TO_POS_INF); -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_ceil_ps - #define _mm_ceil_ps(a) simde_mm_ceil_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_ceil_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_ceil_sd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(simde_math_ceilf) - r_ = simde__m128d_to_private(simde_mm_set_pd(a_.f64[1], simde_math_ceil(b_.f64[0]))); - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_ceil_sd - #define _mm_ceil_sd(a, b) simde_mm_ceil_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_ceil_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_ceil_ss(a, 
b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_ceil_ps(b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_ceil_ps(simde_x_mm_broadcastlow_ps(b))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(simde_math_ceilf) - r_ = simde__m128_to_private(simde_mm_set_ps(a_.f32[3], a_.f32[2], a_.f32[1], simde_math_ceilf(b_.f32[0]))); - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_ceil_ss - #define _mm_ceil_ss(a, b) simde_mm_ceil_ss(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cmpeq_epi64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_cmpeq_epi64(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u64 = vceqq_u64(a_.neon_u64, b_.neon_u64); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - /* (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) */ - uint32x4_t cmp = vceqq_u32(a_.neon_u32, b_.neon_u32); - uint32x4_t swapped = vrev64q_u32(cmp); - r_.neon_u32 = vandq_u32(cmp, swapped); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), a_.i64 == b_.i64); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed long long), vec_cmpeq(a_.altivec_i64, b_.altivec_i64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = (a_.u64[i] == b_.u64[i]) ? ~UINT64_C(0) : UINT64_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_cmpeq_epi64 - #define _mm_cmpeq_epi64(a, b) simde_mm_cmpeq_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtepi8_epi16 (simde__m128i a) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_cvtepi8_epi16(a); - #elif defined(SIMDE_X86_SSE2_NATIVE) - return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int8x16_t s8x16 = a_.neon_i8; /* xxxx xxxx xxxx DCBA */ - int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ - r_.neon_i16 = s16x8; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_extend_low_i8x16(a_.wasm_v128); - #elif defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, a_.i8, - -1, 0, -1, 1, -1, 2, -1, 3, - -1, 4, -1, 5, -1, 6, -1, 7)); - r_.i16 >>= 8; - #elif defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.i16, a_.m64_private[0].i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i8[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_cvtepi8_epi16 - #define _mm_cvtepi8_epi16(a) simde_mm_cvtepi8_epi16(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtepi8_epi32 (simde__m128i a) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_cvtepi8_epi32(a); - #elif defined(SIMDE_X86_SSE2_NATIVE) 
- __m128i tmp = _mm_unpacklo_epi8(a, a); - tmp = _mm_unpacklo_epi16(tmp, tmp); - return _mm_srai_epi32(tmp, 24); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int8x16_t s8x16 = a_.neon_i8; /* xxxx xxxx xxxx DCBA */ - int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ - int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */ - r_.neon_i32 = s32x4; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(a_.wasm_v128)); - #elif defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, a_.i8, - -1, -1, -1, 0, -1, -1, -1, 1, - -1, -1, -1, 2, -1, -1, -1, 3)); - r_.i32 >>= 24; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i8[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_cvtepi8_epi32 - #define _mm_cvtepi8_epi32(a) simde_mm_cvtepi8_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtepi8_epi64 (simde__m128i a) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_cvtepi8_epi64(a); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int8x16_t s8x16 = a_.neon_i8; /* xxxx xxxx xxxx xxBA */ - int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */ - int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ - int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ - r_.neon_i64 = s64x2; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t extra = wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(a_.wasm_v128)); - v128_t sign = wasm_i32x4_gt(wasm_i64x2_const(0, 0), extra); - r_.wasm_v128 = wasm_i32x4_shuffle(extra, sign, 0, 4, 1, 5); - #elif (!defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64)) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) - /* Disabled on x86 due to lack of 64-bit arithmetic shift until - * until AVX-512 (at which point we would be using the native - * _mm_cvtepi_epi64 anyways). 
*/ - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, a_.i8, - -1, -1, -1, -1, -1, -1, -1, 0, - -1, -1, -1, -1, -1, -1, -1, 1)); - r_.i64 >>= 56; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i8[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_cvtepi8_epi64 - #define _mm_cvtepi8_epi64(a) simde_mm_cvtepi8_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtepu8_epi16 (simde__m128i a) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_cvtepu8_epi16(a); - #elif defined(SIMDE_X86_SSE2_NATIVE) - return _mm_unpacklo_epi8(a, _mm_setzero_si128()); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint8x16_t u8x16 = a_.neon_u8; /* xxxx xxxx xxxx DCBA */ - uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ - r_.neon_u16 = u16x8; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u16x8_extend_low_u8x16(a_.wasm_v128); - #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) - __typeof__(r_.i8) z = { 0, }; - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, z, - 0, 16, 1, 17, 2, 18, 3, 19, - 4, 20, 5, 21, 6, 22, 7, 23)); - #elif defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_CLANG_45541) && (!defined(SIMDE_ARCH_POWER) || !defined(__clang__)) - SIMDE_CONVERT_VECTOR_(r_.i16, a_.m64_private[0].u8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.u8[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_cvtepu8_epi16 - #define _mm_cvtepu8_epi16(a) simde_mm_cvtepu8_epi16(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtepu8_epi32 (simde__m128i a) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_cvtepu8_epi32(a); - #elif defined(SIMDE_X86_SSSE3_NATIVE) - __m128i s = _mm_set_epi8( - HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x03), - HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x02), - HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x01), - HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x00)); - return _mm_shuffle_epi8(a, s); - #elif defined(SIMDE_X86_SSE2_NATIVE) - __m128i z = _mm_setzero_si128(); - return _mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint8x16_t u8x16 = a_.neon_u8; /* xxxx xxxx xxxx DCBA */ - uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ - uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */ - r_.neon_u32 = u32x4; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(a_.wasm_v128)); - #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) - __typeof__(r_.i8) z = { 0, }; - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, z, - 0, 17, 
18, 19, 1, 21, 22, 23, - 2, 25, 26, 27, 3, 29, 30, 31)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.u8[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_cvtepu8_epi32 - #define _mm_cvtepu8_epi32(a) simde_mm_cvtepu8_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtepu8_epi64 (simde__m128i a) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_cvtepu8_epi64(a); - #elif defined(SIMDE_X86_SSSE3_NATIVE) - __m128i s = _mm_set_epi8( - HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x80), - HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x01), - HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x80), - HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x00)); - return _mm_shuffle_epi8(a, s); - #elif defined(SIMDE_X86_SSE2_NATIVE) - __m128i z = _mm_setzero_si128(); - return _mm_unpacklo_epi32(_mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z), z); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint8x16_t u8x16 = a_.neon_u8; /* xxxx xxxx xxxx xxBA */ - uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */ - uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ - uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ - r_.neon_u64 = u64x2; - #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) - __typeof__(r_.i8) z = { 0, }; - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, z, - 0, 17, 18, 19, 20, 21, 22, 23, - 1, 25, 26, 27, 28, 29, 30, 31)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.u8[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_cvtepu8_epi64 - #define _mm_cvtepu8_epi64(a) simde_mm_cvtepu8_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtepi16_epi32 (simde__m128i a) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_cvtepi16_epi32(a); - #elif defined(SIMDE_X86_SSE2_NATIVE) - return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vmovl_s16(vget_low_s16(a_.neon_i16)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_extend_low_i16x8(a_.wasm_v128); - #elif !defined(SIMDE_ARCH_X86) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, a_.i16, 8, 0, 10, 1, 12, 2, 14, 3)); - r_.i32 >>= 16; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i16[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_cvtepi16_epi32 - #define _mm_cvtepi16_epi32(a) simde_mm_cvtepi16_epi32(a) -#endif - 
-SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtepu16_epi32 (simde__m128i a) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_cvtepu16_epi32(a); - #elif defined(SIMDE_X86_SSE2_NATIVE) - return _mm_unpacklo_epi16(a, _mm_setzero_si128()); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vmovl_u16(vget_low_u16(a_.neon_u16)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u32x4_extend_low_u16x8(a_.wasm_v128); - #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) - __typeof__(r_.u16) z = { 0, }; - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.u16, z, - 0, 9, 1, 11, 2, 13, 3, 15)); - #elif defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_CLANG_45541) && (!defined(SIMDE_ARCH_POWER) || !defined(__clang__)) - SIMDE_CONVERT_VECTOR_(r_.i32, a_.m64_private[0].u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.u16[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_cvtepu16_epi32 - #define _mm_cvtepu16_epi32(a) simde_mm_cvtepu16_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtepu16_epi64 (simde__m128i a) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_cvtepu16_epi64(a); - #elif defined(SIMDE_X86_SSE2_NATIVE) - __m128i z = _mm_setzero_si128(); - return _mm_unpacklo_epi32(_mm_unpacklo_epi16(a, z), z); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint16x8_t u16x8 = a_.neon_u16; /* xxxx xxxx xxxx 0B0A */ - uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ - uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ - r_.neon_u64 = u64x2; - #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) - __typeof__(r_.u16) z = { 0, }; - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.u16, z, - 0, 9, 10, 11, - 1, 13, 14, 15)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.u16[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_cvtepu16_epi64 - #define _mm_cvtepu16_epi64(a) simde_mm_cvtepu16_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtepi16_epi64 (simde__m128i a) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_cvtepi16_epi64(a); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int16x8_t s16x8 = a_.neon_i16; /* xxxx xxxx xxxx 0B0A */ - int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ - int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ - r_.neon_i64 = s64x2; - #elif (!defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64)) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, a_.i16, - 8, 9, 10, 0, - 12, 13, 14, 1)); - r_.i64 >>= 48; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i16[i]; - } - #endif - - return simde__m128i_from_private(r_); - 
#endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_cvtepi16_epi64 - #define _mm_cvtepi16_epi64(a) simde_mm_cvtepi16_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtepi32_epi64 (simde__m128i a) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_cvtepi32_epi64(a); - #elif defined(SIMDE_X86_SSE2_NATIVE) - __m128i tmp = _mm_shuffle_epi32(a, 0x50); - tmp = _mm_srai_epi32(tmp, 31); - tmp = _mm_shuffle_epi32(tmp, 0xed); - return _mm_unpacklo_epi32(a, tmp); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vmovl_s32(vget_low_s32(a_.neon_i32)); - #elif !defined(SIMDE_ARCH_X86) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, a_.i32, -1, 0, -1, 1)); - r_.i64 >>= 32; - #elif defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.i64, a_.m64_private[0].i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i32[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_cvtepi32_epi64 - #define _mm_cvtepi32_epi64(a) simde_mm_cvtepi32_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtepu32_epi64 (simde__m128i a) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_cvtepu32_epi64(a); - #elif defined(SIMDE_X86_SSE2_NATIVE) - return _mm_unpacklo_epi32(a, _mm_setzero_si128()); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u64 = vmovl_u32(vget_low_u32(a_.neon_u32)); - #elif defined(SIMDE_VECTOR_SCALAR) && defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) - __typeof__(r_.u32) z = { 0, }; - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(32, 16, a_.u32, z, 0, 4, 1, 6)); - #elif defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.i64, a_.m64_private[0].u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.u32[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_cvtepu32_epi64 - #define _mm_cvtepu32_epi64(a) simde_mm_cvtepu32_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_dp_pd (simde__m128d a, simde__m128d b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vmulq_f64(a_.neon_f64, b_.neon_f64); - - switch (imm8) { - case 0xff: - r_.neon_f64 = vaddq_f64(r_.neon_f64, vextq_f64(r_.neon_f64, r_.neon_f64, 1)); - break; - case 0x13: - r_.neon_f64 = vdupq_lane_f64(vget_low_f64(r_.neon_f64), 0); - break; - default: - { /* imm8 is a compile-time constant, so this all becomes just a load */ - uint64_t mask_data[] = { - (imm8 & (1 << 4)) ? ~UINT64_C(0) : UINT64_C(0), - (imm8 & (1 << 5)) ? ~UINT64_C(0) : UINT64_C(0), - }; - r_.neon_f64 = vreinterpretq_f64_u64(vandq_u64(vld1q_u64(mask_data), vreinterpretq_u64_f64(r_.neon_f64))); - } - - r_.neon_f64 = vdupq_n_f64(vaddvq_f64(r_.neon_f64)); - - { - uint64_t mask_data[] = { - (imm8 & 1) ? ~UINT64_C(0) : UINT64_C(0), - (imm8 & 2) ? 
~UINT64_C(0) : UINT64_C(0) - }; - r_.neon_f64 = vreinterpretq_f64_u64(vandq_u64(vld1q_u64(mask_data), vreinterpretq_u64_f64(r_.neon_f64))); - } - break; - } - #else - simde_float64 sum = SIMDE_FLOAT64_C(0.0); - - SIMDE_VECTORIZE_REDUCTION(+:sum) - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - sum += ((imm8 >> (i + 4)) & 1) ? (a_.f64[i] * b_.f64[i]) : 0.0; - } - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = ((imm8 >> i) & 1) ? sum : 0.0; - } - #endif - - return simde__m128d_from_private(r_); -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) -# define simde_mm_dp_pd(a, b, imm8) _mm_dp_pd(a, b, imm8) -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_dp_pd - #define _mm_dp_pd(a, b, imm8) simde_mm_dp_pd(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_dp_ps (simde__m128 a, simde__m128 b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f32 = vmulq_f32(a_.neon_f32, b_.neon_f32); - - switch (imm8) { - case 0xff: - r_.neon_f32 = vdupq_n_f32(vaddvq_f32(r_.neon_f32)); - break; - case 0x7f: - r_.neon_f32 = vsetq_lane_f32(0, r_.neon_f32, 3); - r_.neon_f32 = vdupq_n_f32(vaddvq_f32(r_.neon_f32)); - break; - default: - { - { - uint32_t mask_data[] = { - (imm8 & (1 << 4)) ? ~UINT32_C(0) : UINT32_C(0), - (imm8 & (1 << 5)) ? ~UINT32_C(0) : UINT32_C(0), - (imm8 & (1 << 6)) ? ~UINT32_C(0) : UINT32_C(0), - (imm8 & (1 << 7)) ? ~UINT32_C(0) : UINT32_C(0) - }; - r_.neon_f32 = vreinterpretq_f32_u32(vandq_u32(vld1q_u32(mask_data), vreinterpretq_u32_f32(r_.neon_f32))); - } - - r_.neon_f32 = vdupq_n_f32(vaddvq_f32(r_.neon_f32)); - - { - uint32_t mask_data[] = { - (imm8 & 1) ? ~UINT32_C(0) : UINT32_C(0), - (imm8 & 2) ? ~UINT32_C(0) : UINT32_C(0), - (imm8 & 4) ? ~UINT32_C(0) : UINT32_C(0), - (imm8 & 8) ? ~UINT32_C(0) : UINT32_C(0) - }; - r_.neon_f32 = vreinterpretq_f32_u32(vandq_u32(vld1q_u32(mask_data), vreinterpretq_u32_f32(r_.neon_f32))); - } - } - break; - } - #else - simde_float32 sum = SIMDE_FLOAT32_C(0.0); - - SIMDE_VECTORIZE_REDUCTION(+:sum) - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - sum += ((imm8 >> (i + 4)) & 1) ? (a_.f32[i] * b_.f32[i]) : SIMDE_FLOAT32_C(0.0); - } - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = ((imm8 >> i) & 1) ? 
sum : SIMDE_FLOAT32_C(0.0); - } - #endif - - return simde__m128_from_private(r_); -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) - #if defined(HEDLEY_MCST_LCC_VERSION) - #define simde_mm_dp_ps(a, b, imm8) (__extension__ ({ \ - SIMDE_LCC_DISABLE_DEPRECATED_WARNINGS \ - _mm_dp_ps((a), (b), (imm8)); \ - SIMDE_LCC_REVERT_DEPRECATED_WARNINGS \ - })) - #else - #define simde_mm_dp_ps(a, b, imm8) _mm_dp_ps(a, b, imm8) - #endif -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_dp_ps - #define _mm_dp_ps(a, b, imm8) simde_mm_dp_ps(a, b, imm8) -#endif - -#if defined(simde_mm_extract_epi8) -# undef simde_mm_extract_epi8 -#endif -SIMDE_FUNCTION_ATTRIBUTES -int8_t -simde_mm_extract_epi8 (simde__m128i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) { - simde__m128i_private - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - #if defined(SIMDE_BUG_GCC_95227) - (void) a_; - (void) imm8; - #endif - return vec_extract(a_.altivec_i8, imm8); - #else - return a_.i8[imm8 & 15]; - #endif -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_EXTRACT_EPI8) -# define simde_mm_extract_epi8(a, imm8) HEDLEY_STATIC_CAST(int8_t, _mm_extract_epi8(a, imm8)) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) -# define simde_mm_extract_epi8(a, imm8) vgetq_lane_s8(simde__m128i_to_neon_i8(a), imm8) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) -# define simde_mm_extract_epi8(a, imm8) wasm_u8x16_extract_lane(simde__m128i_to_wasm_v128((a)), (imm8) & 15) -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_extract_epi8 - #define _mm_extract_epi8(a, imm8) HEDLEY_STATIC_CAST(int, simde_mm_extract_epi8(a, imm8)) -#endif - -#if defined(simde_mm_extract_epi32) -# undef simde_mm_extract_epi32 -#endif -SIMDE_FUNCTION_ATTRIBUTES -int32_t -simde_mm_extract_epi32 (simde__m128i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) { - simde__m128i_private - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - #if defined(SIMDE_BUG_GCC_95227) - (void) a_; - (void) imm8; - #endif - return vec_extract(a_.altivec_i32, imm8); - #else - return a_.i32[imm8 & 3]; - #endif -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) -# define simde_mm_extract_epi32(a, imm8) _mm_extract_epi32(a, imm8) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) -# define simde_mm_extract_epi32(a, imm8) vgetq_lane_s32(simde__m128i_to_neon_i32(a), imm8) -#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) -# define simde_mm_extract_epi32(a, imm8) HEDLEY_STATIC_CAST(int32_t, vec_extract(simde__m128i_to_altivec_i32(a), imm8)) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) -# define simde_mm_extract_epi32(a, imm8) wasm_i32x4_extract_lane(simde__m128i_to_wasm_v128((a)), (imm8) & 3) -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_extract_epi32 - #define _mm_extract_epi32(a, imm8) simde_mm_extract_epi32(a, imm8) -#endif - -#if defined(simde_mm_extract_epi64) -# undef simde_mm_extract_epi64 -#endif -SIMDE_FUNCTION_ATTRIBUTES -int64_t -simde_mm_extract_epi64 (simde__m128i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) { - simde__m128i_private - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - #if defined(SIMDE_BUG_GCC_95227) - (void) a_; - (void) imm8; - #endif - return vec_extract(a_.altivec_i64, imm8); - #else - return a_.i64[imm8 & 1]; - #endif -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) && defined(SIMDE_ARCH_AMD64) -# define simde_mm_extract_epi64(a, imm8) _mm_extract_epi64(a, imm8) -#elif 
defined(SIMDE_ARM_NEON_A32V7_NATIVE) -# define simde_mm_extract_epi64(a, imm8) vgetq_lane_s64(simde__m128i_to_neon_i64(a), imm8) -#elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) -# define simde_mm_extract_epi64(a, imm8) HEDLEY_STATIC_CAST(int64_t, vec_extract(simde__m128i_to_altivec_i64(a), imm8)) -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) - #undef _mm_extract_epi64 - #define _mm_extract_epi64(a, imm8) simde_mm_extract_epi64(a, imm8) -#endif - -#if defined(simde_mm_extract_ps) -# undef simde_mm_extract_ps -#endif -SIMDE_FUNCTION_ATTRIBUTES -int32_t -simde_mm_extract_ps (simde__m128 a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) { - simde__m128_private - a_ = simde__m128_to_private(a); - - return a_.i32[imm8 & 3]; -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) - #define simde_mm_extract_ps(a, imm8) _mm_extract_ps(a, imm8) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #define simde_mm_extract_ps(a, imm8) vgetq_lane_s32(simde__m128_to_neon_i32(a), imm8) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - #define simde_mm_extract_ps(a, imm8) wasm_i32x4_extract_lane(simde__m128_to_wasm_v128((a)), (imm8) & 3) -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_extract_ps - #define _mm_extract_ps(a, imm8) simde_mm_extract_ps(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_floor_pd (simde__m128d a) { - #if defined(SIMDE_WASM_SIMD128_NATIVE) - return simde__m128d_from_wasm_v128(wasm_f64x2_floor(simde__m128d_to_wasm_v128(a))); - #endif - return simde_mm_round_pd(a, SIMDE_MM_FROUND_TO_NEG_INF); -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_floor_pd - #define _mm_floor_pd(a) simde_mm_floor_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_floor_ps (simde__m128 a) { - #if defined(SIMDE_WASM_SIMD128_NATIVE) - return simde__m128_from_wasm_v128(wasm_f32x4_floor(simde__m128_to_wasm_v128(a))); - #endif - return simde_mm_round_ps(a, SIMDE_MM_FROUND_TO_NEG_INF); -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_floor_ps - #define _mm_floor_ps(a) simde_mm_floor_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_floor_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_floor_sd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(simde_math_floor) - r_.f64[0] = simde_math_floor(b_.f64[0]); - r_.f64[1] = a_.f64[1]; - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_floor_sd - #define _mm_floor_sd(a, b) simde_mm_floor_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_floor_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_floor_ss(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_floor_ps(b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_floor_ps(simde_x_mm_broadcastlow_ps(b))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(simde_math_floorf) - r_.f32[0] = simde_math_floorf(b_.f32[0]); - for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[i]; - } - #else - HEDLEY_UNREACHABLE(); - #endif - - return 
simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_floor_ss - #define _mm_floor_ss(a, b) simde_mm_floor_ss(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_insert_epi8 (simde__m128i a, int i, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) { - simde__m128i_private - r_ = simde__m128i_to_private(a); - - r_.i8[imm8] = HEDLEY_STATIC_CAST(int8_t, i); - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) - /* clang-3.8 returns an incompatible type, so we need the cast. MSVC - * can't handle the cast ("error C2440: 'type cast': cannot convert - * from '__m128i' to '__m128i'"). */ - #if defined(__clang__) - #define simde_mm_insert_epi8(a, i, imm8) HEDLEY_REINTERPRET_CAST(__m128i, _mm_insert_epi8(a, i, imm8)) - #else - #define simde_mm_insert_epi8(a, i, imm8) _mm_insert_epi8(a, i, imm8) - #endif -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) -# define simde_mm_insert_epi8(a, i, imm8) simde__m128i_from_neon_i8(vsetq_lane_s8(i, simde__m128i_to_neon_i8(a), imm8)) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) -# define simde_mm_insert_epi8(a, i, imm8) simde__m128i_from_wasm_v128(wasm_i8x16_replace_lane(simde__m128i_to_wasm_v128((a)), (imm8) & 15, HEDLEY_STATIC_CAST(int8_t, (i)))) -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_insert_epi8 - #define _mm_insert_epi8(a, i, imm8) simde_mm_insert_epi8(a, i, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_insert_epi32 (simde__m128i a, int i, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) { - simde__m128i_private - r_ = simde__m128i_to_private(a); - - r_.i32[imm8] = HEDLEY_STATIC_CAST(int32_t, i); - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) - #if defined(__clang__) - #define simde_mm_insert_epi32(a, i, imm8) HEDLEY_REINTERPRET_CAST(__m128i, _mm_insert_epi32(a, i, imm8)) - #else - #define simde_mm_insert_epi32(a, i, imm8) _mm_insert_epi32(a, i, imm8) - #endif -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) -# define simde_mm_insert_epi32(a, i, imm8) simde__m128i_from_neon_i32(vsetq_lane_s32(i, simde__m128i_to_neon_i32(a), imm8)) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) -# define simde_mm_insert_epi32(a, i, imm8) simde__m128i_from_wasm_v128(wasm_i32x4_replace_lane(simde__m128i_to_wasm_v128((a)), (imm8) & 3, (i))) -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_insert_epi32 - #define _mm_insert_epi32(a, i, imm8) simde_mm_insert_epi32(a, i, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_insert_epi64 (simde__m128i a, int64_t i, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) { - #if defined(SIMDE_BUG_GCC_94482) - simde__m128i_private - a_ = simde__m128i_to_private(a); - - switch(imm8) { - case 0: - return simde_mm_set_epi64x(a_.i64[1], i); - break; - case 1: - return simde_mm_set_epi64x(i, a_.i64[0]); - break; - default: - HEDLEY_UNREACHABLE(); - break; - } - #else - simde__m128i_private - r_ = simde__m128i_to_private(a); - - r_.i64[imm8] = i; - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) && defined(SIMDE_ARCH_AMD64) -# define simde_mm_insert_epi64(a, i, imm8) _mm_insert_epi64(a, i, imm8) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) -# define simde_mm_insert_epi64(a, i, imm8) simde__m128i_from_neon_i64(vsetq_lane_s64(i, simde__m128i_to_neon_i64(a), imm8)) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) -# define simde_mm_insert_epi64(a, i, imm8) 
simde__m128i_from_wasm_v128(wasm_i64x2_replace_lane(simde__m128i_to_wasm_v128((a)), (imm8) & 1, (i))) -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) - #undef _mm_insert_epi64 - #define _mm_insert_epi64(a, i, imm8) simde_mm_insert_epi64(a, i, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_insert_ps (simde__m128 a, simde__m128 b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - float tmp1_ = b_.f32[(imm8 >> 6) & 3]; - a_.f32[(imm8 >> 4) & 3] = tmp1_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = ((imm8 >> i) & 1 ) ? SIMDE_FLOAT32_C(0.0) : a_.f32[i]; - } - - return simde__m128_from_private(r_); -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) -# define simde_mm_insert_ps(a, b, imm8) _mm_insert_ps(a, b, imm8) -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_insert_ps - #define _mm_insert_ps(a, b, imm8) simde_mm_insert_ps(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_max_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(__PGI) - return _mm_max_epi8(a, b); - #elif defined(SIMDE_X86_SSE2_NATIVE) - __m128i m = _mm_cmpgt_epi8(a, b); - return _mm_or_si128(_mm_and_si128(m, a), _mm_andnot_si128(m, b)); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vmaxq_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_max(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i8 = vec_max(a_.altivec_i8, b_.altivec_i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[i] > b_.i8[i] ? a_.i8[i] : b_.i8[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_max_epi8 - #define _mm_max_epi8(a, b) simde_mm_max_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_max_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(__PGI) - return _mm_max_epi32(a, b); - #elif defined(SIMDE_X86_SSE2_NATIVE) - __m128i m = _mm_cmpgt_epi32(a, b); - return _mm_or_si128(_mm_and_si128(m, a), _mm_andnot_si128(m, b)); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vmaxq_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_max(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i32 = vec_max(a_.altivec_i32, b_.altivec_i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] > b_.i32[i] ? 
a_.i32[i] : b_.i32[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_max_epi32 - #define _mm_max_epi32(a, b) simde_mm_max_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_max_epu16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_max_epu16(a, b); - #elif defined(SIMDE_X86_SSE2_NATIVE) - /* https://github.com/simd-everywhere/simde/issues/855#issuecomment-881656284 */ - return _mm_add_epi16(b, _mm_subs_epu16(a, b)); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vmaxq_u16(a_.neon_u16, b_.neon_u16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u16x8_max(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_u16 = vec_max(a_.altivec_u16, b_.altivec_u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = a_.u16[i] > b_.u16[i] ? a_.u16[i] : b_.u16[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_max_epu16 - #define _mm_max_epu16(a, b) simde_mm_max_epu16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_max_epu32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_max_epu32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vmaxq_u32(a_.neon_u32, b_.neon_u32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u32x4_max(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_u32 = vec_max(a_.altivec_u32, b_.altivec_u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] > b_.u32[i] ? a_.u32[i] : b_.u32[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_max_epu32 - #define _mm_max_epu32(a, b) simde_mm_max_epu32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_min_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(__PGI) - return _mm_min_epi8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vminq_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_min(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i8 = vec_min(a_.altivec_i8, b_.altivec_i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[i] < b_.i8[i] ? 
a_.i8[i] : b_.i8[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_min_epi8 - #define _mm_min_epi8(a, b) simde_mm_min_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_min_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(__PGI) - return _mm_min_epi32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vminq_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_min(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i32 = vec_min(a_.altivec_i32, b_.altivec_i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] < b_.i32[i] ? a_.i32[i] : b_.i32[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_min_epi32 - #define _mm_min_epi32(a, b) simde_mm_min_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_min_epu16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_min_epu16(a, b); - #elif defined(SIMDE_X86_SSE2_NATIVE) - /* https://github.com/simd-everywhere/simde/issues/855#issuecomment-881656284 */ - return _mm_sub_epi16(a, _mm_subs_epu16(a, b)); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vminq_u16(a_.neon_u16, b_.neon_u16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u16x8_min(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_u16 = vec_min(a_.altivec_u16, b_.altivec_u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = a_.u16[i] < b_.u16[i] ? a_.u16[i] : b_.u16[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_min_epu16 - #define _mm_min_epu16(a, b) simde_mm_min_epu16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_min_epu32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_min_epu32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vminq_u32(a_.neon_u32, b_.neon_u32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u32x4_min(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_u32 = vec_min(a_.altivec_u32, b_.altivec_u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] < b_.u32[i] ? 
a_.u32[i] : b_.u32[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_min_epu32 - #define _mm_min_epu32(a, b) simde_mm_min_epu32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_minpos_epu16 (simde__m128i a) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_minpos_epu16(a); - #else - simde__m128i_private - r_ = simde__m128i_to_private(simde_mm_setzero_si128()), - a_ = simde__m128i_to_private(a); - - r_.u16[0] = UINT16_MAX; - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - if (a_.u16[i] < r_.u16[0]) { - r_.u16[0] = a_.u16[i]; - r_.u16[1] = HEDLEY_STATIC_CAST(uint16_t, i); - } - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_minpos_epu16 - #define _mm_minpos_epu16(a) simde_mm_minpos_epu16(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mpsadbw_epu8 (simde__m128i a, simde__m128i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - const int a_offset = imm8 & 4; - const int b_offset = (imm8 & 3) << 2; - -#if defined(simde_math_abs) - for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, (sizeof(r_.u16) / sizeof(r_.u16[0]))) ; i++) { - r_.u16[i] = - HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset + i + 0] - b_.u8[b_offset + 0]))) + - HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset + i + 1] - b_.u8[b_offset + 1]))) + - HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset + i + 2] - b_.u8[b_offset + 2]))) + - HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset + i + 3] - b_.u8[b_offset + 3]))); - } -#else - HEDLEY_UNREACHABLE(); -#endif - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(SIMDE_BUG_PGI_30107) -# define simde_mm_mpsadbw_epu8(a, b, imm8) _mm_mpsadbw_epu8(a, b, imm8) -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_mpsadbw_epu8 - #define _mm_mpsadbw_epu8(a, b, imm8) simde_mm_mpsadbw_epu8(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mul_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_mul_epi32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - // vmull_s32 upcasts instead of masking, so we downcast. 
- int32x2_t a_lo = vmovn_s64(a_.neon_i64); - int32x2_t b_lo = vmovn_s64(b_.neon_i64); - r_.neon_i64 = vmull_s32(a_lo, b_lo); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x2_make( - wasm_i32x4_extract_lane(a_.wasm_v128, 0) * HEDLEY_STATIC_CAST(int64_t, wasm_i32x4_extract_lane(b_.wasm_v128, 0)), - wasm_i32x4_extract_lane(a_.wasm_v128, 2) * HEDLEY_STATIC_CAST(int64_t, wasm_i32x4_extract_lane(b_.wasm_v128, 2))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = - HEDLEY_STATIC_CAST(int64_t, a_.i32[i * 2]) * - HEDLEY_STATIC_CAST(int64_t, b_.i32[i * 2]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_mul_epi32 - #define _mm_mul_epi32(a, b) simde_mm_mul_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mullo_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_mullo_epi32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vmulq_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - (void) a_; - (void) b_; - r_.altivec_i32 = vec_mul(a_.altivec_i32, b_.altivec_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_mul(a_.wasm_v128, b_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (HEDLEY_STATIC_CAST(uint64_t, (HEDLEY_STATIC_CAST(int64_t, a_.i32[i]) * HEDLEY_STATIC_CAST(int64_t, b_.i32[i]))) & 0xffffffff)); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_mullo_epi32 - #define _mm_mullo_epi32(a, b) simde_mm_mullo_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_mullo_epu32 (simde__m128i a, simde__m128i b) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vmulq_u32(a_.neon_u32, b_.neon_u32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u32 = a_.u32 * b_.u32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] * b_.u32[i]; - } - #endif - - return simde__m128i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_packus_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_packus_epi32(a, b); - #elif defined(SIMDE_X86_SSE2_NATIVE) - const __m128i max = _mm_set1_epi32(UINT16_MAX); - const __m128i tmpa = _mm_andnot_si128(_mm_srai_epi32(a, 31), a); - const __m128i tmpb = _mm_andnot_si128(_mm_srai_epi32(b, 31), b); - return - _mm_packs_epi32( - _mm_srai_epi32(_mm_slli_epi32(_mm_or_si128(tmpa, _mm_cmpgt_epi32(tmpa, max)), 16), 16), - _mm_srai_epi32(_mm_slli_epi32(_mm_or_si128(tmpb, _mm_cmpgt_epi32(tmpb, max)), 16), 16) - ); - #else - simde__m128i_private - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - r_; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - #if defined(SIMDE_BUG_CLANG_46840) - r_.neon_u16 = vqmovun_high_s32(vreinterpret_s16_u16(vqmovun_s32(a_.neon_i32)), b_.neon_i32); - #else - r_.neon_u16 = vqmovun_high_s32(vqmovun_s32(a_.neon_i32), b_.neon_i32); - #endif - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = - 
vcombine_u16( - vqmovun_s32(a_.neon_i32), - vqmovun_s32(b_.neon_i32) - ); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_u16 = vec_packsu(a_.altivec_i32, b_.altivec_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u16x8_narrow_i32x4(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_CONVERT_VECTOR_) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - int32_t v SIMDE_VECTOR(32) = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.i32, b_.i32, 0, 1, 2, 3, 4, 5, 6, 7); - - v &= ~(v >> 31); - v |= HEDLEY_REINTERPRET_CAST(__typeof__(v), v > UINT16_MAX); - - SIMDE_CONVERT_VECTOR_(r_.i16, v); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - int32_t v = (i < (sizeof(a_.i32) / sizeof(a_.i32[0]))) ? a_.i32[i] : b_.i32[i & 3]; - r_.u16[i] = (v < 0) ? UINT16_C(0) : ((v > UINT16_MAX) ? UINT16_MAX : HEDLEY_STATIC_CAST(uint16_t, v)); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_packus_epi32 - #define _mm_packus_epi32(a, b) simde_mm_packus_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_round_sd (simde__m128d a, simde__m128d b, int rounding) - SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) { - simde__m128d_private - r_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) { - #if defined(simde_math_nearbyint) - case SIMDE_MM_FROUND_TO_NEAREST_INT: - case SIMDE_MM_FROUND_CUR_DIRECTION: - r_.f64[0] = simde_math_nearbyint(b_.f64[0]); - break; - #endif - - #if defined(simde_math_floor) - case SIMDE_MM_FROUND_TO_NEG_INF: - r_.f64[0] = simde_math_floor(b_.f64[0]); - break; - #endif - - #if defined(simde_math_ceil) - case SIMDE_MM_FROUND_TO_POS_INF: - r_.f64[0] = simde_math_ceil(b_.f64[0]); - break; - #endif - - #if defined(simde_math_trunc) - case SIMDE_MM_FROUND_TO_ZERO: - r_.f64[0] = simde_math_trunc(b_.f64[0]); - break; - #endif - - default: - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); - } - - return simde__m128d_from_private(r_); -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) -# define simde_mm_round_sd(a, b, rounding) _mm_round_sd(a, b, rounding) -#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_FAST_EXCEPTIONS) -# define simde_mm_round_sd(a, b, rounding) simde_mm_move_sd(a, simde_mm_round_pd(b, rounding)) -#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) - #define simde_mm_round_sd(a, b, rounding) simde_mm_move_sd(a, simde_mm_round_pd(simde_x_mm_broadcastlow_pd(b), rounding)) -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_round_sd - #define _mm_round_sd(a, b, rounding) simde_mm_round_sd(a, b, rounding) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_round_ss (simde__m128 a, simde__m128 b, int rounding) - SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) { - simde__m128_private - r_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) { - #if defined(simde_math_nearbyintf) - case SIMDE_MM_FROUND_TO_NEAREST_INT: - case SIMDE_MM_FROUND_CUR_DIRECTION: - r_.f32[0] = simde_math_nearbyintf(b_.f32[0]); - break; - #endif - - #if defined(simde_math_floorf) - case SIMDE_MM_FROUND_TO_NEG_INF: - r_.f32[0] = simde_math_floorf(b_.f32[0]); - break; - #endif - - #if defined(simde_math_ceilf) - case SIMDE_MM_FROUND_TO_POS_INF: - r_.f32[0] = simde_math_ceilf(b_.f32[0]); - break; - #endif - - #if defined(simde_math_truncf) - case SIMDE_MM_FROUND_TO_ZERO: - 
r_.f32[0] = simde_math_truncf(b_.f32[0]); - break; - #endif - - default: - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); - } - - return simde__m128_from_private(r_); -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) - #define simde_mm_round_ss(a, b, rounding) _mm_round_ss(a, b, rounding) -#elif SIMDE_NATURAL_VECTOR_SIZE > 0 && defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm_round_ss(a, b, rounding) simde_mm_move_ss((a), simde_mm_round_ps((b), (rounding))) -#elif SIMDE_NATURAL_VECTOR_SIZE > 0 - #define simde_mm_round_ss(a, b, rounding) simde_mm_move_ss((a), simde_mm_round_ps(simde_x_mm_broadcastlow_ps(b), (rounding))) -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_round_ss - #define _mm_round_ss(a, b, rounding) simde_mm_round_ss(a, b, rounding) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_stream_load_si128 (const simde__m128i* mem_addr) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_stream_load_si128(HEDLEY_CONST_CAST(simde__m128i*, mem_addr)); - #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_load) && ( \ - defined(SIMDE_ARM_NEON_A32V7_NATIVE) || defined(SIMDE_VECTOR_SUBSCRIPT) || \ - defined(SIMDE_WASM_SIMD128_NATIVE) || defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || \ - defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)) - return __builtin_nontemporal_load(mem_addr); - #else - return simde_mm_load_si128(mem_addr); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_stream_load_si128 - #define _mm_stream_load_si128(mem_addr) simde_mm_stream_load_si128(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_test_all_ones (simde__m128i a) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_test_all_ones(a); - #else - simde__m128i_private a_ = simde__m128i_to_private(a); - int r; - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r = vec_all_eq(a_.altivec_i32, vec_splats(~0)); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r = ((vgetq_lane_s64(a_.neon_i64, 0) & vgetq_lane_s64(a_.neon_i64, 1)) == ~HEDLEY_STATIC_CAST(int64_t, 0)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r = HEDLEY_STATIC_CAST(unsigned long long, wasm_i64x2_extract_lane(a_.wasm_v128, 0) & wasm_i64x2_extract_lane(a_.wasm_v128, 1)) == 0xFFFFFFFFFFFFFFFFull; - #else - int_fast32_t r_ = ~HEDLEY_STATIC_CAST(int_fast32_t, 0); - - SIMDE_VECTORIZE_REDUCTION(&:r_) - for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) { - r_ &= a_.i32f[i]; - } - - r = (r_ == ~HEDLEY_STATIC_CAST(int_fast32_t, 0)); - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_test_all_ones - #define _mm_test_all_ones(a) simde_mm_test_all_ones(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_test_all_zeros (simde__m128i a, simde__m128i mask) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_test_all_zeros(a, mask); - #else - simde__m128i_private tmp_ = simde__m128i_to_private(simde_mm_and_si128(a, mask)); - int r; - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r = vec_all_eq(tmp_.altivec_i32, vec_splats(0)); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r = !(vgetq_lane_s64(tmp_.neon_i64, 0) | vgetq_lane_s64(tmp_.neon_i64, 1)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r = (wasm_i64x2_extract_lane(tmp_.wasm_v128, 0) | wasm_i64x2_extract_lane(tmp_.wasm_v128, 1)) == 0; - #else - int_fast32_t r_ = HEDLEY_STATIC_CAST(int_fast32_t, 0); - - SIMDE_VECTORIZE_REDUCTION(|:r_) - for (size_t i = 0 ; i < (sizeof(tmp_.i32f) / sizeof(tmp_.i32f[0])) ; i++) { - r_ |= tmp_.i32f[i]; - } - - r = !r_; - #endif - - return r; - #endif -} 
-#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_test_all_zeros - #define _mm_test_all_zeros(a, mask) simde_mm_test_all_zeros(a, mask) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_test_mix_ones_zeros (simde__m128i a, simde__m128i mask) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_test_mix_ones_zeros(a, mask); - #else - simde__m128i_private - a_ = simde__m128i_to_private(a), - mask_ = simde__m128i_to_private(mask); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int64x2_t s640 = vandq_s64(a_.neon_i64, mask_.neon_i64); - int64x2_t s641 = vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(a_.neon_i64))), mask_.neon_i64); - return (((vgetq_lane_s64(s640, 0) | vgetq_lane_s64(s640, 1)) & (vgetq_lane_s64(s641, 0) | vgetq_lane_s64(s641, 1)))!=0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t m = wasm_v128_and(a_.wasm_v128, mask_.wasm_v128); - long long c0 = wasm_i64x2_extract_lane(m, 0); - long long c1 = wasm_i64x2_extract_lane(m, 1); - long long ones = c0 | c1; - long long zeros = ~(c0 & c1); - return ones && zeros; - #else - for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) - if (((a_.u64[i] & mask_.u64[i]) != 0) && ((~a_.u64[i] & mask_.u64[i]) != 0)) - return 1; - - return 0; - #endif - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_test_mix_ones_zeros - #define _mm_test_mix_ones_zeros(a, mask) simde_mm_test_mix_ones_zeros(a, mask) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_testc_si128 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_testc_si128(a, b); - #else - simde__m128i_private - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int64x2_t s64 = vbicq_s64(b_.neon_i64, a_.neon_i64); - return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t m = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128); - return (wasm_i64x2_extract_lane(m, 0) | wasm_i64x2_extract_lane(m, 1)) == 0; - #else - int_fast32_t r = 0; - - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) { - r |= ~a_.i32f[i] & b_.i32f[i]; - } - - return HEDLEY_STATIC_CAST(int, !r); - #endif - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_testc_si128 - #define _mm_testc_si128(a, b) simde_mm_testc_si128(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_testnzc_si128 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_testnzc_si128(a, b); - #else - simde__m128i_private - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int64x2_t s640 = vandq_s64(b_.neon_i64, a_.neon_i64); - int64x2_t s641 = vbicq_s64(b_.neon_i64, a_.neon_i64); - return !( !(vgetq_lane_s64(s641, 0) || vgetq_lane_s64(s641, 1)) \ - || !(vgetq_lane_s64(s640, 0) || vgetq_lane_s64(s640, 1)) ); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t m1 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128); - v128_t m2 = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128); - return (wasm_i64x2_extract_lane(m1, 0) | wasm_i64x2_extract_lane(m1, 1)) \ - && (wasm_i64x2_extract_lane(m2, 0) | wasm_i64x2_extract_lane(m2, 1)); - #else - for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { - if (((a_.u64[i] & b_.u64[i]) != 0) && ((~a_.u64[i] & b_.u64[i]) != 0)) - return 1; - } - - return 0; - #endif - #endif -} -#if 
defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_testnzc_si128 - #define _mm_testnzc_si128(a, b) simde_mm_testnzc_si128(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_testz_si128 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_testz_si128(a, b); - #else - simde__m128i_private - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int64x2_t s64 = vandq_s64(a_.neon_i64, b_.neon_i64); - return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t m = wasm_v128_and(a_.wasm_v128, b_.wasm_v128); - return (wasm_i64x2_extract_lane(m, 0) | wasm_i64x2_extract_lane(m, 1)) == 0; - #elif defined(SIMDE_HAVE_INT128_) - if ((a_.u128[0] & b_.u128[0]) == 0) { - return 1; - } - return 0; - #else - for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { - if ((a_.u64[i] & b_.u64[i]) > 0) - return 0; - } - #endif - - return 1; - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_testz_si128 - #define _mm_testz_si128(a, b) simde_mm_testz_si128(a, b) -#endif - -SIMDE_END_DECLS_ - -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_SSE4_1_H) */ diff --git a/extern/simde/x86/sse4.2.h b/extern/simde/x86/sse4.2.h deleted file mode 100644 index ae9e7569e..000000000 --- a/extern/simde/x86/sse4.2.h +++ /dev/null @@ -1,381 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2017 Evan Nemerson - * 2020 Hidayat Khan - */ - -#if !defined(SIMDE_X86_SSE4_2_H) -#define SIMDE_X86_SSE4_2_H - -#include "sse4.1.h" - -#if defined(__ARM_ACLE) || (defined(__GNUC__) && defined(__ARM_FEATURE_CRC32)) - #include -#endif - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -#if defined(SIMDE_X86_SSE4_2_NATIVE) - #define SIMDE_SIDD_UBYTE_OPS _SIDD_UBYTE_OPS - #define SIMDE_SIDD_UWORD_OPS _SIDD_UWORD_OPS - #define SIMDE_SIDD_SBYTE_OPS _SIDD_SBYTE_OPS - #define SIMDE_SIDD_SWORD_OPS _SIDD_SWORD_OPS - #define SIMDE_SIDD_CMP_EQUAL_ANY _SIDD_CMP_EQUAL_ANY - #define SIMDE_SIDD_CMP_RANGES _SIDD_CMP_RANGES - #define SIMDE_SIDD_CMP_EQUAL_EACH _SIDD_CMP_EQUAL_EACH - #define SIMDE_SIDD_CMP_EQUAL_ORDERED _SIDD_CMP_EQUAL_ORDERED - #define SIMDE_SIDD_POSITIVE_POLARITY _SIDD_POSITIVE_POLARITY - #define SIMDE_SIDD_NEGATIVE_POLARITY _SIDD_NEGATIVE_POLARITY - #define SIMDE_SIDD_MASKED_POSITIVE_POLARITY _SIDD_MASKED_POSITIVE_POLARITY - #define SIMDE_SIDD_MASKED_NEGATIVE_POLARITY _SIDD_MASKED_NEGATIVE_POLARITY - #define SIMDE_SIDD_LEAST_SIGNIFICANT _SIDD_LEAST_SIGNIFICANT - #define SIMDE_SIDD_MOST_SIGNIFICANT _SIDD_MOST_SIGNIFICANT - #define SIMDE_SIDD_BIT_MASK _SIDD_BIT_MASK - #define SIMDE_SIDD_UNIT_MASK _SIDD_UNIT_MASK -#else - #define SIMDE_SIDD_UBYTE_OPS 0x00 - #define SIMDE_SIDD_UWORD_OPS 0x01 - #define SIMDE_SIDD_SBYTE_OPS 0x02 - #define SIMDE_SIDD_SWORD_OPS 0x03 - #define SIMDE_SIDD_CMP_EQUAL_ANY 0x00 - #define SIMDE_SIDD_CMP_RANGES 0x04 - #define SIMDE_SIDD_CMP_EQUAL_EACH 0x08 - #define SIMDE_SIDD_CMP_EQUAL_ORDERED 0x0c - #define SIMDE_SIDD_POSITIVE_POLARITY 0x00 - #define SIMDE_SIDD_NEGATIVE_POLARITY 0x10 - #define SIMDE_SIDD_MASKED_POSITIVE_POLARITY 0x20 - #define SIMDE_SIDD_MASKED_NEGATIVE_POLARITY 0x30 - #define SIMDE_SIDD_LEAST_SIGNIFICANT 0x00 - #define SIMDE_SIDD_MOST_SIGNIFICANT 0x40 - #define SIMDE_SIDD_BIT_MASK 0x00 - #define SIMDE_SIDD_UNIT_MASK 0x40 -#endif - -#if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES) && !defined(_SIDD_UBYTE_OPS) - #define _SIDD_UBYTE_OPS SIMDE_SIDD_UBYTE_OPS - #define _SIDD_UWORD_OPS SIMDE_SIDD_UWORD_OPS - #define _SIDD_SBYTE_OPS SIMDE_SIDD_SBYTE_OPS - #define _SIDD_SWORD_OPS SIMDE_SIDD_SWORD_OPS - #define _SIDD_CMP_EQUAL_ANY SIMDE_SIDD_CMP_EQUAL_ANY - #define _SIDD_CMP_RANGES SIMDE_SIDD_CMP_RANGES - #define _SIDD_CMP_EQUAL_EACH SIMDE_SIDD_CMP_EQUAL_EACH - #define _SIDD_CMP_EQUAL_ORDERED SIMDE_SIDD_CMP_EQUAL_ORDERED - #define _SIDD_POSITIVE_POLARITY SIMDE_SIDD_POSITIVE_POLARITY - #define _SIDD_NEGATIVE_POLARITY SIMDE_SIDD_NEGATIVE_POLARITY - #define _SIDD_MASKED_POSITIVE_POLARITY SIMDE_SIDD_MASKED_POSITIVE_POLARITY - #define _SIDD_MASKED_NEGATIVE_POLARITY SIMDE_SIDD_MASKED_NEGATIVE_POLARITY - #define _SIDD_LEAST_SIGNIFICANT SIMDE_SIDD_LEAST_SIGNIFICANT - #define _SIDD_MOST_SIGNIFICANT SIMDE_SIDD_MOST_SIGNIFICANT - #define _SIDD_BIT_MASK SIMDE_SIDD_BIT_MASK - #define _SIDD_UNIT_MASK SIMDE_SIDD_UNIT_MASK -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int simde_mm_cmpestrs (simde__m128i a, int la, simde__m128i b, int lb, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - #if !defined(HEDLEY_PGI_VERSION) - /* https://www.pgroup.com/userforum/viewtopic.php?f=4&p=27590&sid=cf89f8bf30be801831fe4a2ff0a2fa6c */ - (void) a; - (void) b; - #endif - (void) la; - (void) lb; - return la <= ((128 / ((imm8 & SIMDE_SIDD_UWORD_OPS) ? 
16 : 8)) - 1); -} -#if defined(SIMDE_X86_SSE4_2_NATIVE) - #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - #define simde_mm_cmpestrs(a, la, b, lb, imm8) \ - _mm_cmpestrs( \ - HEDLEY_REINTERPRET_CAST(__v16qi, a), la, \ - HEDLEY_REINTERPRET_CAST(__v16qi, b), lb, \ - imm8) - #else - #define simde_mm_cmpestrs(a, la, b, lb, imm8) _mm_cmpestrs(a, la, b, lb, imm8) - #endif -#endif -#if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES) - #undef _mm_cmpestrs - #define _mm_cmpestrs(a, la, b, lb, imm8) simde_mm_cmpestrs(a, la, b, lb, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int simde_mm_cmpestrz (simde__m128i a, int la, simde__m128i b, int lb, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - #if !defined(HEDLEY_PGI_VERSION) - /* https://www.pgroup.com/userforum/viewtopic.php?f=4&p=27590&sid=cf89f8bf30be801831fe4a2ff0a2fa6c */ - (void) a; - (void) b; - #endif - (void) la; - (void) lb; - return lb <= ((128 / ((imm8 & SIMDE_SIDD_UWORD_OPS) ? 16 : 8)) - 1); -} -#if defined(SIMDE_X86_SSE4_2_NATIVE) - #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - #define simde_mm_cmpestrz(a, la, b, lb, imm8) \ - _mm_cmpestrz( \ - HEDLEY_REINTERPRET_CAST(__v16qi, a), la, \ - HEDLEY_REINTERPRET_CAST(__v16qi, b), lb, \ - imm8) - #else - #define simde_mm_cmpestrz(a, la, b, lb, imm8) _mm_cmpestrz(a, la, b, lb, imm8) - #endif -#endif -#if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES) - #undef _mm_cmpestrz - #define _mm_cmpestrz(a, la, b, lb, imm8) simde_mm_cmpestrz(a, la, b, lb, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cmpgt_epi64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE4_2_NATIVE) - return _mm_cmpgt_epi64(a, b); - #elif defined(SIMDE_X86_SSE2_NATIVE) - /* https://stackoverflow.com/a/65175746/501126 */ - __m128i r = _mm_and_si128(_mm_cmpeq_epi32(a, b), _mm_sub_epi64(b, a)); - r = _mm_or_si128(r, _mm_cmpgt_epi32(a, b)); - return _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1)); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u64 = vcgtq_s64(a_.neon_i64, b_.neon_i64); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - /* https://stackoverflow.com/a/65223269/501126 */ - r_.neon_i64 = vshrq_n_s64(vqsubq_s64(b_.neon_i64, a_.neon_i64), 63); - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - r_.altivec_u64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), vec_cmpgt(a_.altivec_i64, b_.altivec_i64)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x2_gt(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), a_.i64 > b_.i64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = (a_.i64[i] > b_.i64[i]) ? 
~INT64_C(0) : INT64_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES) - #undef _mm_cmpgt_epi64 - #define _mm_cmpgt_epi64(a, b) simde_mm_cmpgt_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_cmpistrs_8_(simde__m128i a) { - simde__m128i_private a_= simde__m128i_to_private(a); - const int upper_bound = (128 / 8) - 1; - int a_invalid = 0; - SIMDE_VECTORIZE - for (int i = 0 ; i <= upper_bound ; i++) { - if(!a_.i8[i]) - a_invalid = 1; - } - return a_invalid; -} - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_cmpistrs_16_(simde__m128i a) { - simde__m128i_private a_= simde__m128i_to_private(a); - const int upper_bound = (128 / 16) - 1; - int a_invalid = 0; - SIMDE_VECTORIZE - for (int i = 0 ; i <= upper_bound ; i++) { - if(!a_.i16[i]) - a_invalid = 1; - } - return a_invalid; -} - -#if defined(SIMDE_X86_SSE4_2_NATIVE) - #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - #define simde_mm_cmpistrs(a, b, imm8) \ - _mm_cmpistrs( \ - HEDLEY_REINTERPRET_CAST(__v16qi, a), \ - HEDLEY_REINTERPRET_CAST(__v16qi, b), \ - imm8) - #else - #define simde_mm_cmpistrs(a, b, imm8) _mm_cmpistrs(a, b, imm8) - #endif -#else - #define simde_mm_cmpistrs(a, b, imm8) \ - (((imm8) & SIMDE_SIDD_UWORD_OPS) \ - ? simde_mm_cmpistrs_16_((a)) \ - : simde_mm_cmpistrs_8_((a))) -#endif -#if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES) - #undef _mm_cmpistrs - #define _mm_cmpistrs(a, b, imm8) simde_mm_cmpistrs(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_cmpistrz_8_(simde__m128i b) { - simde__m128i_private b_= simde__m128i_to_private(b); - const int upper_bound = (128 / 8) - 1; - int b_invalid = 0; - SIMDE_VECTORIZE - for (int i = 0 ; i <= upper_bound ; i++) { - if(!b_.i8[i]) - b_invalid = 1; - } - return b_invalid; -} - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_cmpistrz_16_(simde__m128i b) { - simde__m128i_private b_= simde__m128i_to_private(b); - const int upper_bound = (128 / 16) - 1; - int b_invalid = 0; - SIMDE_VECTORIZE - for (int i = 0 ; i <= upper_bound ; i++) { - if(!b_.i16[i]) - b_invalid = 1; - } - return b_invalid; -} - -#if defined(SIMDE_X86_SSE4_2_NATIVE) - #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - #define simde_mm_cmpistrz(a, b, imm8) \ - _mm_cmpistrz( \ - HEDLEY_REINTERPRET_CAST(__v16qi, a), \ - HEDLEY_REINTERPRET_CAST(__v16qi, b), \ - imm8) - #else - #define simde_mm_cmpistrz(a, b, imm8) _mm_cmpistrz(a, b, imm8) - #endif -#else - #define simde_mm_cmpistrz(a, b, imm8) \ - (((imm8) & SIMDE_SIDD_UWORD_OPS) \ - ? 
simde_mm_cmpistrz_16_((b)) \ - : simde_mm_cmpistrz_8_((b))) -#endif -#if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES) - #undef _mm_cmpistrz - #define _mm_cmpistrz(a, b, imm8) simde_mm_cmpistrz(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -uint32_t -simde_mm_crc32_u8(uint32_t prevcrc, uint8_t v) { - #if defined(SIMDE_X86_SSE4_2_NATIVE) - return _mm_crc32_u8(prevcrc, v); - #else - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_CRC32) - return __crc32cb(prevcrc, v); - #else - uint32_t crc = prevcrc; - crc ^= v; - for(int bit = 0 ; bit < 8 ; bit++) { - if (crc & 1) - crc = (crc >> 1) ^ UINT32_C(0x82f63b78); - else - crc = (crc >> 1); - } - return crc; - #endif - #endif -} -#if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES) - #define _mm_crc32_u8(prevcrc, v) simde_mm_crc32_u8(prevcrc, v) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -uint32_t -simde_mm_crc32_u16(uint32_t prevcrc, uint16_t v) { - #if defined(SIMDE_X86_SSE4_2_NATIVE) - return _mm_crc32_u16(prevcrc, v); - #else - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_CRC32) - return __crc32ch(prevcrc, v); - #else - uint32_t crc = prevcrc; - crc = simde_mm_crc32_u8(crc, v & 0xff); - crc = simde_mm_crc32_u8(crc, (v >> 8) & 0xff); - return crc; - #endif - #endif -} -#if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES) - #define _mm_crc32_u16(prevcrc, v) simde_mm_crc32_u16(prevcrc, v) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -uint32_t -simde_mm_crc32_u32(uint32_t prevcrc, uint32_t v) { - #if defined(SIMDE_X86_SSE4_2_NATIVE) - return _mm_crc32_u32(prevcrc, v); - #else - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_CRC32) - return __crc32cw(prevcrc, v); - #else - uint32_t crc = prevcrc; - crc = simde_mm_crc32_u16(crc, v & 0xffff); - crc = simde_mm_crc32_u16(crc, (v >> 16) & 0xffff); - return crc; - #endif - #endif -} -#if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES) - #define _mm_crc32_u32(prevcrc, v) simde_mm_crc32_u32(prevcrc, v) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -uint64_t -simde_mm_crc32_u64(uint64_t prevcrc, uint64_t v) { - #if defined(SIMDE_X86_SSE4_2_NATIVE) && defined(SIMDE_ARCH_AMD64) - return _mm_crc32_u64(prevcrc, v); - #else - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_CRC32) - return __crc32cd(HEDLEY_STATIC_CAST(uint32_t, prevcrc), v); - #else - uint64_t crc = prevcrc; - crc = simde_mm_crc32_u32(HEDLEY_STATIC_CAST(uint32_t, crc), v & 0xffffffff); - crc = simde_mm_crc32_u32(HEDLEY_STATIC_CAST(uint32_t, crc), (v >> 32) & 0xffffffff); - return crc; - #endif - #endif -} -#if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) - #define _mm_crc32_u64(prevcrc, v) simde_mm_crc32_u64(prevcrc, v) -#endif - -SIMDE_END_DECLS_ - -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_SSE4_2_H) */ diff --git a/extern/simde/x86/ssse3.h b/extern/simde/x86/ssse3.h deleted file mode 100644 index 6c4c12d5f..000000000 --- a/extern/simde/x86/ssse3.h +++ /dev/null @@ -1,1057 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice 
and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2017-2020 Evan Nemerson - */ - -#if !defined(SIMDE_X86_SSSE3_H) -#define SIMDE_X86_SSSE3_H - -#include "sse3.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_abs_epi8 (simde__m128i a) { - #if defined(SIMDE_X86_SSSE3_NATIVE) - return _mm_abs_epi8(a); - #elif defined(SIMDE_X86_SSE2_NATIVE) - return _mm_min_epu8(a, _mm_sub_epi8(_mm_setzero_si128(), a)); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vabsq_s8(a_.neon_i8); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i8 = vec_abs(a_.altivec_i8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_abs(a_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, (a_.i8[i] < 0) ? (- a_.i8[i]) : a_.i8[i]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_abs_epi8(a) simde_mm_abs_epi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_abs_epi16 (simde__m128i a) { - #if defined(SIMDE_X86_SSSE3_NATIVE) - return _mm_abs_epi16(a); - #elif defined(SIMDE_X86_SSE2_NATIVE) - return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a)); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vabsq_s16(a_.neon_i16); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i16 = vec_abs(a_.altivec_i16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_abs(a_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (a_.i16[i] < 0) ? 
(- a_.i16[i]) : a_.i16[i]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_abs_epi16(a) simde_mm_abs_epi16(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_abs_epi32 (simde__m128i a) { - #if defined(SIMDE_X86_SSSE3_NATIVE) - return _mm_abs_epi32(a); - #elif defined(SIMDE_X86_SSE2_NATIVE) - const __m128i m = _mm_cmpgt_epi32(_mm_setzero_si128(), a); - return _mm_sub_epi32(_mm_xor_si128(a, m), m); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vabsq_s32(a_.neon_i32); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i32 = vec_abs(a_.altivec_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_abs(a_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - #if defined(_MSC_VER) - HEDLEY_DIAGNOSTIC_PUSH - #pragma warning(disable:4146) - #endif - r_.u32[i] = (a_.i32[i] < 0) ? (- HEDLEY_STATIC_CAST(uint32_t, a_.i32[i])) : HEDLEY_STATIC_CAST(uint32_t, a_.i32[i]); - #if defined(_MSC_VER) - HEDLEY_DIAGNOSTIC_POP - #endif - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_abs_epi32(a) simde_mm_abs_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_abs_pi8 (simde__m64 a) { - #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_abs_pi8(a); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vabs_s8(a_.neon_i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, (a_.i8[i] < 0) ? (- a_.i8[i]) : a_.i8[i]); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_abs_pi8(a) simde_mm_abs_pi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_abs_pi16 (simde__m64 a) { - #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_abs_pi16(a); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vabs_s16(a_.neon_i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (a_.i16[i] < 0) ? (- a_.i16[i]) : a_.i16[i]); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_abs_pi16(a) simde_mm_abs_pi16(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_abs_pi32 (simde__m64 a) { - #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_abs_pi32(a); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vabs_s32(a_.neon_i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (a_.i32[i] < 0) ? 
(- a_.i32[i]) : a_.i32[i]); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_abs_pi32(a) simde_mm_abs_pi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_alignr_epi8 (simde__m128i a, simde__m128i b, int count) - SIMDE_REQUIRE_CONSTANT_RANGE(count, 0, 255) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - if (HEDLEY_UNLIKELY(count > 31)) - return simde_mm_setzero_si128(); - - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - const int srcpos = count + HEDLEY_STATIC_CAST(int, i); - if (srcpos > 31) { - r_.i8[i] = 0; - } else if (srcpos > 15) { - r_.i8[i] = a_.i8[(srcpos) & 15]; - } else { - r_.i8[i] = b_.i8[srcpos]; - } - } - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSSE3_NATIVE) - #define simde_mm_alignr_epi8(a, b, count) _mm_alignr_epi8(a, b, count) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #define simde_mm_alignr_epi8(a, b, count) \ - ( \ - ((count) > 31) \ - ? simde__m128i_from_neon_i8(vdupq_n_s8(0)) \ - : ( \ - ((count) > 15) \ - ? (simde__m128i_from_neon_i8(vextq_s8(simde__m128i_to_neon_i8(a), vdupq_n_s8(0), (count) & 15))) \ - : (simde__m128i_from_neon_i8(vextq_s8(simde__m128i_to_neon_i8(b), simde__m128i_to_neon_i8(a), ((count) & 15)))))) -#endif -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) - #define _mm_alignr_epi8(a, b, count) simde_mm_alignr_epi8(a, b, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_alignr_pi8 (simde__m64 a, simde__m64 b, const int count) - SIMDE_REQUIRE_CONSTANT(count) { - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - if (HEDLEY_UNLIKELY(count > 15)) - return simde_mm_setzero_si64(); - - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - const int srcpos = count + HEDLEY_STATIC_CAST(int, i); - if (srcpos > 15) { - r_.i8[i] = 0; - } else if (srcpos > 7) { - r_.i8[i] = a_.i8[(srcpos) & 7]; - } else { - r_.i8[i] = b_.i8[srcpos]; - } - } - - return simde__m64_from_private(r_); -} -#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) -# define simde_mm_alignr_pi8(a, b, count) _mm_alignr_pi8(a, b, count) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #define simde_mm_alignr_pi8(a, b, count) \ - ( \ - ((count) > 15) \ - ? simde__m64_from_neon_i8(vdup_n_s8(0)) \ - : ( \ - ((count) > 7) \ - ? (simde__m64_from_neon_i8(vext_s8(simde__m64_to_neon_i8(a), vdup_n_s8(0), (count) & 7))) \ - : (simde__m64_from_neon_i8(vext_s8(simde__m64_to_neon_i8(b), simde__m64_to_neon_i8(a), ((count) & 7)))))) -#endif -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_alignr_pi8(a, b, count) simde_mm_alignr_pi8(a, b, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_shuffle_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) - return _mm_shuffle_epi8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i8 = vqtbl1q_s8(a_.neon_i8, vandq_u8(b_.neon_u8, vdupq_n_u8(0x8F))); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - /* Mask out the bits we're not interested in. vtbl will result in 0 - * for any values outside of [0, 15], so if the high bit is set it - * will return 0, just like in SSSE3. 
*/ - b_.neon_i8 = vandq_s8(b_.neon_i8, vdupq_n_s8(HEDLEY_STATIC_CAST(int8_t, (1 << 7) | 15))); - - /* Convert a from an int8x16_t to an int8x8x2_t */ - int8x8x2_t i; - i.val[0] = vget_low_s8(a_.neon_i8); - i.val[1] = vget_high_s8(a_.neon_i8); - - /* Table lookups */ - int8x8_t l = vtbl2_s8(i, vget_low_s8(b_.neon_i8)); - int8x8_t h = vtbl2_s8(i, vget_high_s8(b_.neon_i8)); - - r_.neon_i8 = vcombine_s8(l, h); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - /* This is a bit ugly because of the casts and the awful type - * macros (SIMDE_POWER_ALTIVEC_VECTOR), but it's really just - * vec_sel(vec_perm(a, a, b), 0, vec_cmplt(b, 0)) */ - SIMDE_POWER_ALTIVEC_VECTOR(signed char) z = { 0, }; - SIMDE_POWER_ALTIVEC_VECTOR(signed char) msb_mask = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_cmplt(b_.altivec_i8, z)); - SIMDE_POWER_ALTIVEC_VECTOR(signed char) c = vec_perm(a_.altivec_i8, a_.altivec_i8, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), b_.altivec_i8)); - r_.altivec_i8 = vec_sel(c, z, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), msb_mask)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_swizzle( - a_.wasm_v128, wasm_v128_and(b_.wasm_v128, wasm_i8x16_splat(0x8F))); - #else - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[b_.i8[i] & 15] & (~(b_.i8[i]) >> 7); - } - #endif - - return simde__m128i_from_private(r_); -#endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_shuffle_epi8(a, b) simde_mm_shuffle_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_shuffle_pi8 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_shuffle_pi8(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - b_.neon_i8 = vand_s8(b_.neon_i8, vdup_n_s8(HEDLEY_STATIC_CAST(int8_t, (1 << 7) | 7))); - r_.neon_i8 = vtbl1_s8(a_.neon_i8, b_.neon_i8); - #else - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.i8[i] = a_.i8[b_.i8[i] & 7] & (~(b_.i8[i]) >> 7); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_shuffle_pi8(a, b) simde_mm_shuffle_pi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_hadd_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) - return _mm_hadd_epi16(a, b); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return simde__m128i_from_neon_i16(vpaddq_s16(simde__m128i_to_neon_i16(a), simde__m128i_to_neon_i16(b))); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int16x8x2_t t = vuzpq_s16(simde__m128i_to_neon_i16(a), simde__m128i_to_neon_i16(b)); - return simde__m128i_from_neon_i16(vaddq_s16(t.val[0], t.val[1])); - #else - return simde_mm_add_epi16(simde_x_mm_deinterleaveeven_epi16(a, b), simde_x_mm_deinterleaveodd_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_hadd_epi16(a, b) simde_mm_hadd_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_hadd_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) - return _mm_hadd_epi32(a, b); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return simde__m128i_from_neon_i32(vpaddq_s32(simde__m128i_to_neon_i32(a), simde__m128i_to_neon_i32(b))); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int32x4x2_t t = 
vuzpq_s32(simde__m128i_to_neon_i32(a), simde__m128i_to_neon_i32(b)); - return simde__m128i_from_neon_i32(vaddq_s32(t.val[0], t.val[1])); - #else - return simde_mm_add_epi32(simde_x_mm_deinterleaveeven_epi32(a, b), simde_x_mm_deinterleaveodd_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_hadd_epi32(a, b) simde_mm_hadd_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_hadd_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_hadd_pi16(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i16 = vpadd_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int16x4x2_t t = vuzp_s16(a_.neon_i16, b_.neon_i16); - r_.neon_i16 = vadd_s16(t.val[0], t.val[1]); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_) - r_.i16 = - SIMDE_SHUFFLE_VECTOR_(16, 8, a_.i16, b_.i16, 0, 2, 4, 6) + - SIMDE_SHUFFLE_VECTOR_(16, 8, a_.i16, b_.i16, 1, 3, 5, 7); - #else - r_.i16[0] = a_.i16[0] + a_.i16[1]; - r_.i16[1] = a_.i16[2] + a_.i16[3]; - r_.i16[2] = b_.i16[0] + b_.i16[1]; - r_.i16[3] = b_.i16[2] + b_.i16[3]; - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_hadd_pi16(a, b) simde_mm_hadd_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_hadd_pi32 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_hadd_pi32(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i32 = vpadd_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int32x2x2_t t = vuzp_s32(a_.neon_i32, b_.neon_i32); - r_.neon_i32 = vadd_s32(t.val[0], t.val[1]); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_) - r_.i32 = - SIMDE_SHUFFLE_VECTOR_(32, 8, a_.i32, b_.i32, 0, 2) + - SIMDE_SHUFFLE_VECTOR_(32, 8, a_.i32, b_.i32, 1, 3); - #else - r_.i32[0] = a_.i32[0] + a_.i32[1]; - r_.i32[1] = b_.i32[0] + b_.i32[1]; - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_hadd_pi32(a, b) simde_mm_hadd_pi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_hadds_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) - return _mm_hadds_epi16(a, b); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int16x8x2_t t = vuzpq_s16(simde__m128i_to_neon_i16(a), simde__m128i_to_neon_i16(b)); - return simde__m128i_from_neon_i16(vqaddq_s16(t.val[0], t.val[1])); - #else - return simde_mm_adds_epi16(simde_x_mm_deinterleaveeven_epi16(a, b), simde_x_mm_deinterleaveodd_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_hadds_epi16(a, b) simde_mm_hadds_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_hadds_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_hadds_pi16(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int16x4x2_t t = vuzp_s16(a_.neon_i16, b_.neon_i16); - r_.neon_i16 = vqadd_s16(t.val[0], t.val[1]); - #else - for (size_t i = 0 ; i < 
((sizeof(r_.i16) / sizeof(r_.i16[0])) / 2) ; i++) { - int32_t ta = HEDLEY_STATIC_CAST(int32_t, a_.i16[i * 2]) + HEDLEY_STATIC_CAST(int32_t, a_.i16[(i * 2) + 1]); - r_.i16[ i ] = HEDLEY_LIKELY(ta > INT16_MIN) ? (HEDLEY_LIKELY(ta < INT16_MAX) ? HEDLEY_STATIC_CAST(int16_t, ta) : INT16_MAX) : INT16_MIN; - int32_t tb = HEDLEY_STATIC_CAST(int32_t, b_.i16[i * 2]) + HEDLEY_STATIC_CAST(int32_t, b_.i16[(i * 2) + 1]); - r_.i16[i + 2] = HEDLEY_LIKELY(tb > INT16_MIN) ? (HEDLEY_LIKELY(tb < INT16_MAX) ? HEDLEY_STATIC_CAST(int16_t, tb) : INT16_MAX) : INT16_MIN; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_hadds_pi16(a, b) simde_mm_hadds_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_hsub_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) - return _mm_hsub_epi16(a, b); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int16x8x2_t t = vuzpq_s16(simde__m128i_to_neon_i16(a), simde__m128i_to_neon_i16(b)); - return simde__m128i_from_neon_i16(vsubq_s16(t.val[0], t.val[1])); - #else - return simde_mm_sub_epi16(simde_x_mm_deinterleaveeven_epi16(a, b), simde_x_mm_deinterleaveodd_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_hsub_epi16(a, b) simde_mm_hsub_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_hsub_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) - return _mm_hsub_epi32(a, b); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int32x4x2_t t = vuzpq_s32(simde__m128i_to_neon_i32(a), simde__m128i_to_neon_i32(b)); - return simde__m128i_from_neon_i32(vsubq_s32(t.val[0], t.val[1])); - #else - return simde_mm_sub_epi32(simde_x_mm_deinterleaveeven_epi32(a, b), simde_x_mm_deinterleaveodd_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_hsub_epi32(a, b) simde_mm_hsub_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_hsub_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_hsub_pi16(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int16x4x2_t t = vuzp_s16(a_.neon_i16, b_.neon_i16); - r_.neon_i16 = vsub_s16(t.val[0], t.val[1]); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_) - r_.i16 = - SIMDE_SHUFFLE_VECTOR_(16, 8, a_.i16, b_.i16, 0, 2, 4, 6) - - SIMDE_SHUFFLE_VECTOR_(16, 8, a_.i16, b_.i16, 1, 3, 5, 7); - #else - r_.i16[0] = a_.i16[0] - a_.i16[1]; - r_.i16[1] = a_.i16[2] - a_.i16[3]; - r_.i16[2] = b_.i16[0] - b_.i16[1]; - r_.i16[3] = b_.i16[2] - b_.i16[3]; - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_hsub_pi16(a, b) simde_mm_hsub_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_hsub_pi32 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_hsub_pi32(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int32x2x2_t t = vuzp_s32(a_.neon_i32, b_.neon_i32); - r_.neon_i32 = vsub_s32(t.val[0], t.val[1]); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_) - r_.i32 = - SIMDE_SHUFFLE_VECTOR_(32, 8, a_.i32, b_.i32, 0, 2) - - SIMDE_SHUFFLE_VECTOR_(32, 8, 
a_.i32, b_.i32, 1, 3); - #else - r_.i32[0] = a_.i32[0] - a_.i32[1]; - r_.i32[1] = b_.i32[0] - b_.i32[1]; - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_hsub_pi32(a, b) simde_mm_hsub_pi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_hsubs_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) - return _mm_hsubs_epi16(a, b); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int16x8x2_t t = vuzpq_s16(simde__m128i_to_neon_i16(a), simde__m128i_to_neon_i16(b)); - return simde__m128i_from_neon_i16(vqsubq_s16(t.val[0], t.val[1])); - #else - return simde_mm_subs_epi16(simde_x_mm_deinterleaveeven_epi16(a, b), simde_x_mm_deinterleaveodd_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_hsubs_epi16(a, b) simde_mm_hsubs_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_hsubs_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_hsubs_pi16(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int16x4x2_t t = vuzp_s16(a_.neon_i16, b_.neon_i16); - r_.neon_i16 = vqsub_s16(t.val[0], t.val[1]); - #else - for (size_t i = 0 ; i < ((sizeof(r_.i16) / sizeof(r_.i16[0])) / 2) ; i++) { - r_.i16[ i ] = simde_math_subs_i16(a_.i16[i * 2], a_.i16[(i * 2) + 1]); - r_.i16[i + 2] = simde_math_subs_i16(b_.i16[i * 2], b_.i16[(i * 2) + 1]); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_hsubs_pi16(a, b) simde_mm_hsubs_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maddubs_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) - return _mm_maddubs_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - /* Zero extend a */ - int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a_.neon_u16, 8)); - int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a_.neon_u16, vdupq_n_u16(0xff00))); - - /* Sign extend by shifting left then shifting right. */ - int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b_.neon_i16, 8), 8); - int16x8_t b_odd = vshrq_n_s16(b_.neon_i16, 8); - - /* multiply */ - int16x8_t prod1 = vmulq_s16(a_even, b_even); - int16x8_t prod2 = vmulq_s16(a_odd, b_odd); - - /* saturated add */ - r_.neon_i16 = vqaddq_s16(prod1, prod2); - #else - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - const int idx = HEDLEY_STATIC_CAST(int, i) << 1; - int32_t ts = - (HEDLEY_STATIC_CAST(int16_t, a_.u8[ idx ]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[ idx ])) + - (HEDLEY_STATIC_CAST(int16_t, a_.u8[idx + 1]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[idx + 1])); - r_.i16[i] = (ts > INT16_MIN) ? ((ts < INT16_MAX) ? 
HEDLEY_STATIC_CAST(int16_t, ts) : INT16_MAX) : INT16_MIN; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_maddubs_epi16(a, b) simde_mm_maddubs_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_maddubs_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_maddubs_pi16(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - int16x8_t ai = vreinterpretq_s16_u16(vmovl_u8(a_.neon_u8)); - int16x8_t bi = vmovl_s8(b_.neon_i8); - int16x8_t p = vmulq_s16(ai, bi); - int16x4_t l = vget_low_s16(p); - int16x4_t h = vget_high_s16(p); - r_.neon_i16 = vqadd_s16(vuzp1_s16(l, h), vuzp2_s16(l, h)); - #else - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - const int idx = HEDLEY_STATIC_CAST(int, i) << 1; - int32_t ts = - (HEDLEY_STATIC_CAST(int16_t, a_.u8[ idx ]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[ idx ])) + - (HEDLEY_STATIC_CAST(int16_t, a_.u8[idx + 1]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[idx + 1])); - r_.i16[i] = (ts > INT16_MIN) ? ((ts < INT16_MAX) ? HEDLEY_STATIC_CAST(int16_t, ts) : INT16_MAX) : INT16_MIN; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_maddubs_pi16(a, b) simde_mm_maddubs_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mulhrs_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) - return _mm_mulhrs_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - /* Multiply */ - int32x4_t mul_lo = vmull_s16(vget_low_s16(a_.neon_i16), - vget_low_s16(b_.neon_i16)); - int32x4_t mul_hi = vmull_s16(vget_high_s16(a_.neon_i16), - vget_high_s16(b_.neon_i16)); - - /* Rounding narrowing shift right - * narrow = (int16_t)((mul + 16384) >> 15); */ - int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15); - int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15); - - /* Join together */ - r_.neon_i16 = vcombine_s16(narrow_lo, narrow_hi); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t __lo = wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(a_.wasm_v128), wasm_i32x4_extend_low_i16x8(b_.wasm_v128)); - v128_t __hi = wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(a_.wasm_v128), wasm_i32x4_extend_high_i16x8(b_.wasm_v128)); - const v128_t __inc = wasm_i32x4_splat(0x4000); - __lo = wasm_i32x4_add(__lo, __inc); - __hi = wasm_i32x4_add(__hi, __inc); - __lo = wasm_i32x4_add(__lo, __lo); - __hi = wasm_i32x4_add(__hi, __hi); - r_.wasm_v128 = wasm_i16x8_shuffle(__lo, __hi, 1, 3, 5, 7, 9, 11, 13, 15); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, (((HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i])) + 0x4000) >> 15)); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_mulhrs_epi16(a, b) simde_mm_mulhrs_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_mulhrs_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_mulhrs_pi16(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - 
- #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - /* Multiply */ - int32x4_t mul = vmull_s16(a_.neon_i16, b_.neon_i16); - - /* Rounding narrowing shift right - * narrow = (int16_t)((mul + 16384) >> 15); */ - int16x4_t narrow = vrshrn_n_s32(mul, 15); - - /* Join together */ - r_.neon_i16 = narrow; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, (((HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i])) + 0x4000) >> 15)); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_mulhrs_pi16(a, b) simde_mm_mulhrs_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_sign_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) - return _mm_sign_epi8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint8x16_t aneg_mask = vreinterpretq_u8_s8(vshrq_n_s8(b_.neon_i8, 7)); - uint8x16_t bnz_mask; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - bnz_mask = vceqzq_s8(b_.neon_i8); - #else - bnz_mask = vceqq_s8(b_.neon_i8, vdupq_n_s8(0)); - #endif - bnz_mask = vmvnq_u8(bnz_mask); - - r_.neon_i8 = vbslq_s8(aneg_mask, vnegq_s8(a_.neon_i8), vandq_s8(a_.neon_i8, vreinterpretq_s8_u8(bnz_mask))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - simde__m128i mask = wasm_i8x16_shr(b_.wasm_v128, 7); - simde__m128i zeromask = simde_mm_cmpeq_epi8(b_.wasm_v128, simde_mm_setzero_si128()); - r_.wasm_v128 = simde_mm_andnot_si128(zeromask, simde_mm_xor_si128(simde_mm_add_epi8(a_.wasm_v128, mask), mask)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (b_.i8[i] < 0) ? (- a_.i8[i]) : ((b_.i8[i] != 0) ? (a_.i8[i]) : INT8_C(0)); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_sign_epi8(a, b) simde_mm_sign_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_sign_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) - return _mm_sign_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint16x8_t aneg_mask = vreinterpretq_u16_s16(vshrq_n_s16(b_.neon_i16, 15)); - uint16x8_t bnz_mask; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - bnz_mask = vceqzq_s16(b_.neon_i16); - #else - bnz_mask = vceqq_s16(b_.neon_i16, vdupq_n_s16(0)); - #endif - bnz_mask = vmvnq_u16(bnz_mask); - - r_.neon_i16 = vbslq_s16(aneg_mask, vnegq_s16(a_.neon_i16), vandq_s16(a_.neon_i16, vreinterpretq_s16_u16(bnz_mask))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - simde__m128i mask = simde_mm_srai_epi16(b_.wasm_v128, 15); - simde__m128i zeromask = simde_mm_cmpeq_epi16(b_.wasm_v128, simde_mm_setzero_si128()); - r_.wasm_v128 = simde_mm_andnot_si128(zeromask, simde_mm_xor_si128(simde_mm_add_epi16(a_.wasm_v128, mask), mask)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (b_.i16[i] < 0) ? (- a_.i16[i]) : ((b_.i16[i] != 0) ? 
(a_.i16[i]) : INT16_C(0)); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_sign_epi16(a, b) simde_mm_sign_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_sign_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) - return _mm_sign_epi32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint32x4_t aneg_mask = vreinterpretq_u32_s32(vshrq_n_s32(b_.neon_i32, 31)); - uint32x4_t bnz_mask; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - bnz_mask = vceqzq_s32(b_.neon_i32); - #else - bnz_mask = vceqq_s32(b_.neon_i32, vdupq_n_s32(0)); - #endif - bnz_mask = vmvnq_u32(bnz_mask); - - r_.neon_i32 = vbslq_s32(aneg_mask, vnegq_s32(a_.neon_i32), vandq_s32(a_.neon_i32, vreinterpretq_s32_u32(bnz_mask))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - simde__m128i mask = simde_mm_srai_epi32(b_.wasm_v128, 31); - simde__m128i zeromask = simde_mm_cmpeq_epi32(b_.wasm_v128, simde_mm_setzero_si128()); - r_.wasm_v128 = simde_mm_andnot_si128(zeromask, simde_mm_xor_si128(simde_mm_add_epi32(a_.wasm_v128, mask), mask)); - #else - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = (b_.i32[i] < 0) ? (- a_.i32[i]) : ((b_.i32[i] != 0) ? (a_.i32[i]) : INT32_C(0)); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_sign_epi32(a, b) simde_mm_sign_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_sign_pi8 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_sign_pi8(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint8x8_t aneg_mask = vreinterpret_u8_s8(vshr_n_s8(b_.neon_i8, 7)); - uint8x8_t bnz_mask; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - bnz_mask = vceqz_s8(b_.neon_i8); - #else - bnz_mask = vceq_s8(b_.neon_i8, vdup_n_s8(0)); - #endif - bnz_mask = vmvn_u8(bnz_mask); - - r_.neon_i8 = vbsl_s8(aneg_mask, vneg_s8(a_.neon_i8), vand_s8(a_.neon_i8, vreinterpret_s8_u8(bnz_mask))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (b_.i8[i] < 0) ? (- a_.i8[i]) : ((b_.i8[i] != 0) ? (a_.i8[i]) : INT8_C(0)); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_sign_pi8(a, b) simde_mm_sign_pi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_sign_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_sign_pi16(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint16x4_t aneg_mask = vreinterpret_u16_s16(vshr_n_s16(b_.neon_i16, 15)); - uint16x4_t bnz_mask; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - bnz_mask = vceqz_s16(b_.neon_i16); - #else - bnz_mask = vceq_s16(b_.neon_i16, vdup_n_s16(0)); - #endif - bnz_mask = vmvn_u16(bnz_mask); - - r_.neon_i16 = vbsl_s16(aneg_mask, vneg_s16(a_.neon_i16), vand_s16(a_.neon_i16, vreinterpret_s16_u16(bnz_mask))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (b_.i16[i] < 0) ? 
(- a_.i16[i]) : ((b_.i16[i] > 0) ? (a_.i16[i]) : INT16_C(0)); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_sign_pi16(a, b) simde_mm_sign_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_sign_pi32 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_sign_pi32(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint32x2_t aneg_mask = vreinterpret_u32_s32(vshr_n_s32(b_.neon_i32, 31)); - uint32x2_t bnz_mask; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - bnz_mask = vceqz_s32(b_.neon_i32); - #else - bnz_mask = vceq_s32(b_.neon_i32, vdup_n_s32(0)); - #endif - bnz_mask = vmvn_u32(bnz_mask); - - r_.neon_i32 = vbsl_s32(aneg_mask, vneg_s32(a_.neon_i32), vand_s32(a_.neon_i32, vreinterpret_s32_u32(bnz_mask))); - #else - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = (b_.i32[i] < 0) ? (- a_.i32[i]) : ((b_.i32[i] > 0) ? (a_.i32[i]) : INT32_C(0)); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_sign_pi32(a, b) simde_mm_sign_pi32(a, b) -#endif - -SIMDE_END_DECLS_ - -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_SSE2_H) */ diff --git a/samples/data/background.fs b/samples/data/background.fs index 247fadcf0..bd0a8e414 100644 --- a/samples/data/background.fs +++ b/samples/data/background.fs @@ -23,7 +23,7 @@ void main() float noise = random(uv + time * 0.1); // Adjust these values to control the intensity and color of the grain - float grainIntensity = 0.04; + float grainIntensity = 0.03; // Mix the base color with the noise vec3 color = baseColor + vec3(noise * grainIntensity); diff --git a/samples/main.cpp b/samples/main.cpp index 576b4d1ca..85b9a441a 100644 --- a/samples/main.cpp +++ b/samples/main.cpp @@ -142,8 +142,6 @@ static void CreateUI( GLFWwindow* window, const char* glslVersion ) assert( false ); } - // this doesn't look that good - // Search for font file const char* fontPath = "samples/data/droid_sans.ttf"; FILE* file = fopen( fontPath, "rb" ); diff --git a/samples/sample_stacking.cpp b/samples/sample_stacking.cpp index 84115e38b..543b99b90 100644 --- a/samples/sample_stacking.cpp +++ b/samples/sample_stacking.cpp @@ -409,7 +409,7 @@ class CircleStack : public Sample b2World_SetContactTuning( m_worldId, 0.25f * 360.0f, 10.0f, 3.0f ); b2Circle circle = {}; - circle.radius = 0.1f; + circle.radius = 0.5f; b2ShapeDef shapeDef = b2DefaultShapeDef(); b2BodyDef bodyDef = b2DefaultBodyDef(); @@ -417,7 +417,7 @@ class CircleStack : public Sample float y = 0.5f; - for ( int i = 0; i < 20; ++i ) + for ( int i = 0; i < 8; ++i ) { bodyDef.position.y = y; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5078d1489..23437f04d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -88,9 +88,6 @@ target_include_directories(box2d ${CMAKE_CURRENT_SOURCE_DIR} ) -# SIMDE is used to support SIMD math on multiple platforms -target_link_libraries(box2d PRIVATE simde) - # Box2D uses C17 set_target_properties(box2d PROPERTIES C_STANDARD 17 @@ -119,6 +116,10 @@ if (BOX2D_VALIDATE) target_compile_definitions(box2d PRIVATE BOX2D_VALIDATE) endif() +if (BOX2D_ENABLE_SIMD) + target_compile_definitions(box2d PRIVATE BOX2D_ENABLE_SIMD) +endif() + if (MSVC) message(STATUS "Box2D on MSVC") if (BUILD_SHARED_LIBS) @@ -137,6 
+138,7 @@ if (MSVC) if (BOX2D_AVX2) message(STATUS "Box2D using AVX2") + target_compile_definitions(box2d PRIVATE BOX2D_AVX2) target_compile_options(box2d PRIVATE /arch:AVX2) endif() @@ -144,8 +146,10 @@ elseif (MINGW) message(STATUS "Box2D on MinGW") if (BOX2D_AVX2) message(STATUS "Box2D using AVX2") + target_compile_definitions(box2d PRIVATE BOX2D_AVX2) target_compile_options(box2d PRIVATE -mavx2) else() + # todo should no longer be needed # see SIMDE_DIAGNOSTIC_DISABLE_PSABI_ message(STATUS "Box2D disabling ABI warning") target_compile_options(box2d PRIVATE -Wno-psabi) @@ -154,6 +158,7 @@ elseif (APPLE) message(STATUS "Box2D on Apple") elseif (EMSCRIPTEN) message(STATUS "Box2D on Emscripten") + # todo should no longer be needed # see SIMDE_DIAGNOSTIC_DISABLE_PSABI_ message(STATUS "Box2D disabling ABI warning") target_compile_options(box2d PRIVATE -Wno-psabi) @@ -168,8 +173,10 @@ elseif (UNIX) if (BOX2D_AVX2) message(STATUS "Box2D using AVX2") # FMA -mfma -mavx -march=native + target_compile_definitions(box2d PRIVATE BOX2D_AVX2) target_compile_options(box2d PRIVATE -mavx2) else() + # todo should no longer be needed # see SIMDE_DIAGNOSTIC_DISABLE_PSABI_ message(STATUS "Box2D disabling ABI warning") target_compile_options(box2d PRIVATE -Wno-psabi) diff --git a/src/contact_solver.c b/src/contact_solver.c index b19050116..55b6918c5 100644 --- a/src/contact_solver.c +++ b/src/contact_solver.c @@ -9,14 +9,8 @@ #include "core.h" #include "solver_set.h" #include "world.h" -#include "x86/avx2.h" -#include "x86/fma.h" -// Soft contact constraints with sub-stepping support -// http://mmacklin.com/smallsteps.pdf -// https://box2d.org/files/ErinCatto_SoftConstraints_GDC2011.pdf - -// Uses fixed anchors for Jacobians for better behavior on rolling shapes (circles & capsules) +#include void b2PrepareOverflowContacts( b2StepContext* context ) { @@ -258,8 +252,7 @@ void b2SolveOverflowContacts( b2StepContext* context, bool useBias ) b2ContactConstraintPoint* cp = constraint->points + j; // compute current separation - // todo this is subject to round-off error if the anchor is far from the body center of mass - // todo for example a large world with a single static body and many offset shapes + // this is subject to round-off error if the anchor is far from the body center of mass b2Vec2 ds = b2Add( dp, b2Sub( b2RotateVector( dqB, cp->anchorB ), b2RotateVector( dqA, cp->anchorA ) ) ); float s = b2Dot( ds, normal ) + cp->baseSeparation; @@ -470,31 +463,456 @@ void b2StoreOverflowImpulses( b2StepContext* context ) b2TracyCZoneEnd( store_impulses ); } -// SIMD WIP -#define add( a, b ) simde_mm256_add_ps( ( a ), ( b ) ) -#define sub( a, b ) simde_mm256_sub_ps( ( a ), ( b ) ) -#define mul( a, b ) simde_mm256_mul_ps( ( a ), ( b ) ) +#if defined( B2_SIMD_AVX2 ) + +#include + +// wide float holds 8 numbers +typedef __m256 b2FloatW; + +#elif defined( B2_SIMD_NEON ) -// todo SIMDE implementation of simde_mm256_fnmadd_ps is slow if FMA is not available -// #define muladd(a, b, c) simde_mm256_fmadd_ps(b, c, a) -// #define mulsub(a, b, c) simde_mm256_fnmadd_ps(b, c, a) +#include + +// wide float holds 4 numbers +typedef float32x4_t b2FloatW; + +#elif defined( B2_SIMD_SSE2 ) + +#include + +// wide float holds 4 numbers +typedef __m128 b2FloatW; + +#else + +// scalar math +typedef struct b2FloatW +{ + float x, y, z, w; +} b2FloatW; -#define muladd( a, b, c ) simde_mm256_add_ps( ( a ), simde_mm256_mul_ps( ( b ), ( c ) ) ) -#define mulsub( a, b, c ) simde_mm256_sub_ps( ( a ), simde_mm256_mul_ps( ( b ), ( c ) ) ) +#endif 
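/* The B2_SIMD_AVX2 / B2_SIMD_NEON / B2_SIMD_SSE2 and B2_SIMD_WIDTH macros used above
   are not defined in this hunk; they are presumably selected in core.h from the new
   BOX2D_ENABLE_SIMD / BOX2D_AVX2 compile definitions added in src/CMakeLists.txt.
   A minimal, hypothetical sketch of that selection follows — the exact wiring and the
   architecture-detection macros are assumptions, not part of this patch: */
#if defined( BOX2D_ENABLE_SIMD )
	#if defined( BOX2D_AVX2 )
		// 8-wide path: one wide register holds 8 lanes
		#define B2_SIMD_AVX2
		#define B2_SIMD_WIDTH 8
	#elif defined( __aarch64__ ) || defined( _M_ARM64 )
		// 4-wide NEON path
		#define B2_SIMD_NEON
		#define B2_SIMD_WIDTH 4
	#elif defined( __SSE2__ ) || defined( _M_X64 )
		// 4-wide SSE2 path
		#define B2_SIMD_SSE2
		#define B2_SIMD_WIDTH 4
	#else
		// scalar b2FloatW struct still carries 4 lanes
		#define B2_SIMD_WIDTH 4
	#endif
#else
	// SIMD disabled: scalar fallback, 4 lanes
	#define B2_SIMD_WIDTH 4
#endif
/* Under these assumptions the AVX2 path packs 8 contacts per b2ContactConstraintSIMD
   while the NEON, SSE2, and scalar paths pack 4, which matches the B2_SIMD_WIDTH loop
   bounds used in b2PrepareContactsTask below. */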
+ +// Wide vec2 +typedef struct b2Vec2W +{ + b2FloatW X, Y; +} b2Vec2W; + +// Wide rotation +typedef struct b2RotW +{ + b2FloatW C, S; +} b2RotW; + +#if defined( B2_SIMD_AVX2 ) + +static inline b2FloatW b2ZeroW() +{ + return _mm256_setzero_ps(); +} + +static inline b2FloatW b2SplatW( float scalar ) +{ + return _mm256_set1_ps( scalar ); +} + +static inline b2FloatW b2AddW( b2FloatW a, b2FloatW b ) +{ + return _mm256_add_ps( a, b ); +} + +static inline b2FloatW b2SubW( b2FloatW a, b2FloatW b ) +{ + return _mm256_sub_ps( a, b ); +} + +static inline b2FloatW b2MulW( b2FloatW a, b2FloatW b ) +{ + return _mm256_mul_ps( a, b ); +} + +static inline b2FloatW b2MulAddW( b2FloatW a, b2FloatW b, b2FloatW c ) +{ + return _mm256_add_ps( a, _mm256_mul_ps( b, c ) ); +} + +static inline b2FloatW b2MulSubW( b2FloatW a, b2FloatW b, b2FloatW c ) +{ + return _mm256_sub_ps( a, _mm256_mul_ps( b, c ) ); +} + +static inline b2FloatW b2MinW( b2FloatW a, b2FloatW b ) +{ + return _mm256_min_ps( a, b ); +} + +static inline b2FloatW b2MaxW( b2FloatW a, b2FloatW b ) +{ + return _mm256_max_ps( a, b ); +} + +static inline b2FloatW b2OrW( b2FloatW a, b2FloatW b ) +{ + return _mm256_or_ps( a, b ); +} + +static inline b2FloatW b2GreaterThanW( b2FloatW a, b2FloatW b ) +{ + return _mm256_cmp_ps( a, b, _CMP_GT_OQ ); +} + +static inline b2FloatW b2EqualsW( b2FloatW a, b2FloatW b ) +{ + return _mm256_cmp_ps( a, b, _CMP_EQ_OQ ); +} + +// component-wise returns mask ? b : a +static inline b2FloatW b2BlendW( b2FloatW a, b2FloatW b, b2FloatW mask ) +{ + return _mm256_blendv_ps( a, b, mask ); +} + +#elif defined( B2_SIMD_NEON ) + +static inline b2FloatW b2ZeroW() +{ + return vdupq_n_f32( 0.0f ); +} + +static inline b2FloatW b2SplatW( float scalar ) +{ + return vdupq_n_f32( scalar ); +} + +static inline b2FloatW b2SetW( float a, float b, float c, float d ) +{ + float32_t array[4] = { a, b, c, d }; + return vld1q_f32( array ); +} + +static inline b2FloatW b2AddW( b2FloatW a, b2FloatW b ) +{ + return vaddq_f32( a, b ); +} + +static inline b2FloatW b2SubW( b2FloatW a, b2FloatW b ) +{ + return vsubq_f32( a, b ); +} + +static inline b2FloatW b2MulW( b2FloatW a, b2FloatW b ) +{ + return vmulq_f32( a, b ); +} + +static inline b2FloatW b2MulAddW( b2FloatW a, b2FloatW b, b2FloatW c ) +{ + return vmlaq_f32( a, b, c ); +} + +static inline b2FloatW b2MulSubW( b2FloatW a, b2FloatW b, b2FloatW c ) +{ + return vmlsq_f32( a, b, c ); +} + +static inline b2FloatW b2MinW( b2FloatW a, b2FloatW b ) +{ + return vminq_f32( a, b ); +} + +static inline b2FloatW b2MaxW( b2FloatW a, b2FloatW b ) +{ + return vmaxq_f32( a, b ); +} + +static inline b2FloatW b2OrW( b2FloatW a, b2FloatW b ) +{ + return vreinterpretq_f32_u32( vorrq_u32( vreinterpretq_u32_f32( a ), vreinterpretq_u32_f32( b ) ) ); +} + +static inline b2FloatW b2GreaterThanW( b2FloatW a, b2FloatW b ) +{ + return vreinterpretq_f32_u32( vcgtq_f32( a, b ) ); +} + +static inline b2FloatW b2EqualsW( b2FloatW a, b2FloatW b ) +{ + return vreinterpretq_f32_u32( vceqq_f32( a, b ) ); +} + +// component-wise returns mask ? 
b : a +static inline b2FloatW b2BlendW( b2FloatW a, b2FloatW b, b2FloatW mask ) +{ + uint32x4_t mask32 = vreinterpretq_u32_f32( mask ); + return vbslq_f32( mask32, b, a ); +} + +static inline b2FloatW b2LoadW( const float32_t* data ) +{ + return vld1q_f32( data ); +} + +static inline void b2StoreW( float32_t* data, b2FloatW a ) +{ + return vst1q_f32( data, a ); +} + +static inline b2FloatW b2UnpackLoW( b2FloatW a, b2FloatW b ) +{ + return vzip1q_f32( a, b ); +} + +static inline b2FloatW b2UnpackHiW( b2FloatW a, b2FloatW b ) +{ + return vzip2q_f32( a, b ); +} + +#elif defined( B2_SIMD_SSE2 ) + +static inline b2FloatW b2ZeroW() +{ + return _mm_setzero_ps(); +} + +static inline b2FloatW b2SplatW( float scalar ) +{ + return _mm_set1_ps( scalar ); +} + +static inline b2FloatW b2SetW( float a, float b, float c, float d ) +{ + return _mm_setr_ps( a, b, c, d ); +} + +static inline b2FloatW b2AddW( b2FloatW a, b2FloatW b ) +{ + return _mm_add_ps( a, b ); +} + +static inline b2FloatW b2SubW( b2FloatW a, b2FloatW b ) +{ + return _mm_sub_ps( a, b ); +} + +static inline b2FloatW b2MulW( b2FloatW a, b2FloatW b ) +{ + return _mm_mul_ps( a, b ); +} + +static inline b2FloatW b2MulAddW( b2FloatW a, b2FloatW b, b2FloatW c ) +{ + return _mm_add_ps( a, _mm_mul_ps( b, c ) ); +} + +static inline b2FloatW b2MulSubW( b2FloatW a, b2FloatW b, b2FloatW c ) +{ + return _mm_sub_ps( a, _mm_mul_ps( b, c ) ); +} + +static inline b2FloatW b2MinW( b2FloatW a, b2FloatW b ) +{ + return _mm_min_ps( a, b ); +} + +static inline b2FloatW b2MaxW( b2FloatW a, b2FloatW b ) +{ + return _mm_max_ps( a, b ); +} + +static inline b2FloatW b2OrW( b2FloatW a, b2FloatW b ) +{ + return _mm_or_ps( a, b ); +} + +static inline b2FloatW b2GreaterThanW( b2FloatW a, b2FloatW b ) +{ + return _mm_cmpgt_ps( a, b ); +} + +static inline b2FloatW b2EqualsW( b2FloatW a, b2FloatW b ) +{ + return _mm_cmpeq_ps( a, b ); +} + +// component-wise returns mask ? b : a +static inline b2FloatW b2BlendW( b2FloatW a, b2FloatW b, b2FloatW mask ) +{ + return _mm_or_ps( _mm_and_ps( mask, b ), _mm_andnot_ps( mask, a ) ); +} + +static inline b2FloatW b2LoadW( const float* data ) +{ + return _mm_load_ps( data ); +} + +static inline void b2StoreW( float* data, b2FloatW a ) +{ + _mm_store_ps( data, a ); +} + +static inline b2FloatW b2UnpackLoW( b2FloatW a, b2FloatW b ) +{ + return _mm_unpacklo_ps( a, b ); +} + +static inline b2FloatW b2UnpackHiW( b2FloatW a, b2FloatW b ) +{ + return _mm_unpackhi_ps( a, b ); +} + +#else + +static inline b2FloatW b2ZeroW() +{ + return ( b2FloatW ){ 0.0f, 0.0f, 0.0f, 0.0f }; +} + +static inline b2FloatW b2SplatW( float scalar ) +{ + return ( b2FloatW ){ scalar, scalar, scalar, scalar }; +} + +static inline b2FloatW b2AddW( b2FloatW a, b2FloatW b ) +{ + return ( b2FloatW ){ a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w }; +} + +static inline b2FloatW b2SubW( b2FloatW a, b2FloatW b ) +{ + return ( b2FloatW ){ a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w }; +} + +static inline b2FloatW b2MulW( b2FloatW a, b2FloatW b ) +{ + return ( b2FloatW ){ a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w }; +} + +static inline b2FloatW b2MulAddW( b2FloatW a, b2FloatW b, b2FloatW c ) +{ + return ( b2FloatW ){ a.x + b.x * c.x, a.y + b.y * c.y, a.z + b.z * c.z, a.w + b.w * c.w }; +} + +static inline b2FloatW b2MulSubW( b2FloatW a, b2FloatW b, b2FloatW c ) +{ + return ( b2FloatW ){ a.x - b.x * c.x, a.y - b.y * c.y, a.z - b.z * c.z, a.w - b.w * c.w }; +} + +static inline b2FloatW b2MinW( b2FloatW a, b2FloatW b ) +{ + b2FloatW r; + r.x = a.x <= b.x ? 
a.x : b.x; + r.y = a.y <= b.y ? a.y : b.y; + r.z = a.z <= b.z ? a.z : b.z; + r.w = a.w <= b.w ? a.w : b.w; + return r; +} + +static inline b2FloatW b2MaxW( b2FloatW a, b2FloatW b ) +{ + b2FloatW r; + r.x = a.x >= b.x ? a.x : b.x; + r.y = a.y >= b.y ? a.y : b.y; + r.z = a.z >= b.z ? a.z : b.z; + r.w = a.w >= b.w ? a.w : b.w; + return r; +} + +static inline b2FloatW b2OrW( b2FloatW a, b2FloatW b ) +{ + b2FloatW r; + r.x = a.x != 0.0f || b.x != 0.0f ? 1.0f : 0.0f; + r.y = a.y != 0.0f || b.y != 0.0f ? 1.0f : 0.0f; + r.z = a.z != 0.0f || b.z != 0.0f ? 1.0f : 0.0f; + r.w = a.w != 0.0f || b.w != 0.0f ? 1.0f : 0.0f; + return r; +} + +static inline b2FloatW b2GreaterThanW( b2FloatW a, b2FloatW b ) +{ + b2FloatW r; + r.x = a.x > b.x ? 1.0f : 0.0f; + r.y = a.y > b.y ? 1.0f : 0.0f; + r.z = a.z > b.z ? 1.0f : 0.0f; + r.w = a.w > b.w ? 1.0f : 0.0f; + return r; +} + +static inline b2FloatW b2EqualsW( b2FloatW a, b2FloatW b ) +{ + b2FloatW r; + r.x = a.x == b.x ? 1.0f : 0.0f; + r.y = a.y == b.y ? 1.0f : 0.0f; + r.z = a.z == b.z ? 1.0f : 0.0f; + r.w = a.w == b.w ? 1.0f : 0.0f; + return r; +} + +// component-wise returns mask ? b : a +static inline b2FloatW b2BlendW( b2FloatW a, b2FloatW b, b2FloatW mask ) +{ + b2FloatW r; + r.x = mask.x != 0.0f ? b.x : a.x; + r.y = mask.y != 0.0f ? b.y : a.y; + r.z = mask.z != 0.0f ? b.z : a.z; + r.w = mask.w != 0.0f ? b.w : a.w; + return r; +} + +#endif static inline b2FloatW b2DotW( b2Vec2W a, b2Vec2W b ) { - return add( mul( a.X, b.X ), mul( a.Y, b.Y ) ); + return b2AddW( b2MulW( a.X, b.X ), b2MulW( a.Y, b.Y ) ); } static inline b2FloatW b2CrossW( b2Vec2W a, b2Vec2W b ) { - return sub( mul( a.X, b.Y ), mul( a.Y, b.X ) ); + return b2SubW( b2MulW( a.X, b.Y ), b2MulW( a.Y, b.X ) ); } static inline b2Vec2W b2RotateVectorW( b2RotW q, b2Vec2W v ) { - return ( b2Vec2W ){ sub( mul( q.C, v.X ), mul( q.S, v.Y ) ), add( mul( q.S, v.X ), mul( q.C, v.Y ) ) }; + return ( b2Vec2W ){ b2SubW( b2MulW( q.C, v.X ), b2MulW( q.S, v.Y ) ), b2AddW( b2MulW( q.S, v.X ), b2MulW( q.C, v.Y ) ) }; +} + +// Soft contact constraints with sub-stepping support +// Uses fixed anchors for Jacobians for better behavior on rolling shapes (circles & capsules) +// http://mmacklin.com/smallsteps.pdf +// https://box2d.org/files/ErinCatto_SoftConstraints_GDC2011.pdf + +typedef struct b2ContactConstraintSIMD +{ + int indexA[B2_SIMD_WIDTH]; + int indexB[B2_SIMD_WIDTH]; + + b2FloatW invMassA, invMassB; + b2FloatW invIA, invIB; + b2Vec2W normal; + b2FloatW friction; + b2FloatW biasRate; + b2FloatW massScale; + b2FloatW impulseScale; + b2Vec2W anchorA1, anchorB1; + b2FloatW normalMass1, tangentMass1; + b2FloatW baseSeparation1; + b2FloatW normalImpulse1; + b2FloatW maxNormalImpulse1; + b2FloatW tangentImpulse1; + b2Vec2W anchorA2, anchorB2; + b2FloatW baseSeparation2; + b2FloatW normalImpulse2; + b2FloatW maxNormalImpulse2; + b2FloatW tangentImpulse2; + b2FloatW normalMass2, tangentMass2; + b2FloatW restitution; + b2FloatW relativeVelocity1, relativeVelocity2; +} b2ContactConstraintSIMD; + +int b2GetContactConstraintSIMDByteCount( void ) +{ + return sizeof( b2ContactConstraintSIMD ); } // wide version of b2BodyState @@ -507,48 +925,51 @@ typedef struct b2SimdBody b2RotW dq; } b2SimdBody; +// Custom gather/scatter for each SIMD type +#if defined( B2_SIMD_AVX2 ) + // This is a load and 8x8 transpose static b2SimdBody b2GatherBodies( const b2BodyState* restrict states, int* restrict indices ) { _Static_assert( sizeof( b2BodyState ) == 32, "b2BodyState not 32 bytes" ); B2_ASSERT( ( (uintptr_t)states & 0x1F ) == 0 ); - // 
static const b2BodyState b2_identityBodyState = {{0.0f, 0.0f}, 0.0f, 0, {0.0f, 0.0f}, {1.0f, 0.0f}}; - b2FloatW identity = simde_mm256_setr_ps( 0.0f, 0.0f, 0.0f, 0, 0.0f, 0.0f, 1.0f, 0.0f ); - b2FloatW b0 = indices[0] == B2_NULL_INDEX ? identity : simde_mm256_load_ps( (float*)( states + indices[0] ) ); - b2FloatW b1 = indices[1] == B2_NULL_INDEX ? identity : simde_mm256_load_ps( (float*)( states + indices[1] ) ); - b2FloatW b2 = indices[2] == B2_NULL_INDEX ? identity : simde_mm256_load_ps( (float*)( states + indices[2] ) ); - b2FloatW b3 = indices[3] == B2_NULL_INDEX ? identity : simde_mm256_load_ps( (float*)( states + indices[3] ) ); - b2FloatW b4 = indices[4] == B2_NULL_INDEX ? identity : simde_mm256_load_ps( (float*)( states + indices[4] ) ); - b2FloatW b5 = indices[5] == B2_NULL_INDEX ? identity : simde_mm256_load_ps( (float*)( states + indices[5] ) ); - b2FloatW b6 = indices[6] == B2_NULL_INDEX ? identity : simde_mm256_load_ps( (float*)( states + indices[6] ) ); - b2FloatW b7 = indices[7] == B2_NULL_INDEX ? identity : simde_mm256_load_ps( (float*)( states + indices[7] ) ); - - b2FloatW t0 = simde_mm256_unpacklo_ps( b0, b1 ); - b2FloatW t1 = simde_mm256_unpackhi_ps( b0, b1 ); - b2FloatW t2 = simde_mm256_unpacklo_ps( b2, b3 ); - b2FloatW t3 = simde_mm256_unpackhi_ps( b2, b3 ); - b2FloatW t4 = simde_mm256_unpacklo_ps( b4, b5 ); - b2FloatW t5 = simde_mm256_unpackhi_ps( b4, b5 ); - b2FloatW t6 = simde_mm256_unpacklo_ps( b6, b7 ); - b2FloatW t7 = simde_mm256_unpackhi_ps( b6, b7 ); - b2FloatW tt0 = simde_mm256_shuffle_ps( t0, t2, SIMDE_MM_SHUFFLE( 1, 0, 1, 0 ) ); - b2FloatW tt1 = simde_mm256_shuffle_ps( t0, t2, SIMDE_MM_SHUFFLE( 3, 2, 3, 2 ) ); - b2FloatW tt2 = simde_mm256_shuffle_ps( t1, t3, SIMDE_MM_SHUFFLE( 1, 0, 1, 0 ) ); - b2FloatW tt3 = simde_mm256_shuffle_ps( t1, t3, SIMDE_MM_SHUFFLE( 3, 2, 3, 2 ) ); - b2FloatW tt4 = simde_mm256_shuffle_ps( t4, t6, SIMDE_MM_SHUFFLE( 1, 0, 1, 0 ) ); - b2FloatW tt5 = simde_mm256_shuffle_ps( t4, t6, SIMDE_MM_SHUFFLE( 3, 2, 3, 2 ) ); - b2FloatW tt6 = simde_mm256_shuffle_ps( t5, t7, SIMDE_MM_SHUFFLE( 1, 0, 1, 0 ) ); - b2FloatW tt7 = simde_mm256_shuffle_ps( t5, t7, SIMDE_MM_SHUFFLE( 3, 2, 3, 2 ) ); + // b2BodyState b2_identityBodyState = {{0.0f, 0.0f}, 0.0f, 0, {0.0f, 0.0f}, {1.0f, 0.0f}}; + b2FloatW identity = _mm256_setr_ps( 0.0f, 0.0f, 0.0f, 0, 0.0f, 0.0f, 1.0f, 0.0f ); + b2FloatW b0 = indices[0] == B2_NULL_INDEX ? identity : _mm256_load_ps( (float*)( states + indices[0] ) ); + b2FloatW b1 = indices[1] == B2_NULL_INDEX ? identity : _mm256_load_ps( (float*)( states + indices[1] ) ); + b2FloatW b2 = indices[2] == B2_NULL_INDEX ? identity : _mm256_load_ps( (float*)( states + indices[2] ) ); + b2FloatW b3 = indices[3] == B2_NULL_INDEX ? identity : _mm256_load_ps( (float*)( states + indices[3] ) ); + b2FloatW b4 = indices[4] == B2_NULL_INDEX ? identity : _mm256_load_ps( (float*)( states + indices[4] ) ); + b2FloatW b5 = indices[5] == B2_NULL_INDEX ? identity : _mm256_load_ps( (float*)( states + indices[5] ) ); + b2FloatW b6 = indices[6] == B2_NULL_INDEX ? identity : _mm256_load_ps( (float*)( states + indices[6] ) ); + b2FloatW b7 = indices[7] == B2_NULL_INDEX ? 
identity : _mm256_load_ps( (float*)( states + indices[7] ) ); + + b2FloatW t0 = _mm256_unpacklo_ps( b0, b1 ); + b2FloatW t1 = _mm256_unpackhi_ps( b0, b1 ); + b2FloatW t2 = _mm256_unpacklo_ps( b2, b3 ); + b2FloatW t3 = _mm256_unpackhi_ps( b2, b3 ); + b2FloatW t4 = _mm256_unpacklo_ps( b4, b5 ); + b2FloatW t5 = _mm256_unpackhi_ps( b4, b5 ); + b2FloatW t6 = _mm256_unpacklo_ps( b6, b7 ); + b2FloatW t7 = _mm256_unpackhi_ps( b6, b7 ); + b2FloatW tt0 = _mm256_shuffle_ps( t0, t2, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + b2FloatW tt1 = _mm256_shuffle_ps( t0, t2, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + b2FloatW tt2 = _mm256_shuffle_ps( t1, t3, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + b2FloatW tt3 = _mm256_shuffle_ps( t1, t3, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + b2FloatW tt4 = _mm256_shuffle_ps( t4, t6, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + b2FloatW tt5 = _mm256_shuffle_ps( t4, t6, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + b2FloatW tt6 = _mm256_shuffle_ps( t5, t7, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + b2FloatW tt7 = _mm256_shuffle_ps( t5, t7, _MM_SHUFFLE( 3, 2, 3, 2 ) ); b2SimdBody simdBody; - simdBody.v.X = simde_mm256_permute2f128_ps( tt0, tt4, 0x20 ); - simdBody.v.Y = simde_mm256_permute2f128_ps( tt1, tt5, 0x20 ); - simdBody.w = simde_mm256_permute2f128_ps( tt2, tt6, 0x20 ); - simdBody.flags = simde_mm256_permute2f128_ps( tt3, tt7, 0x20 ); - simdBody.dp.X = simde_mm256_permute2f128_ps( tt0, tt4, 0x31 ); - simdBody.dp.Y = simde_mm256_permute2f128_ps( tt1, tt5, 0x31 ); - simdBody.dq.C = simde_mm256_permute2f128_ps( tt2, tt6, 0x31 ); - simdBody.dq.S = simde_mm256_permute2f128_ps( tt3, tt7, 0x31 ); + simdBody.v.X = _mm256_permute2f128_ps( tt0, tt4, 0x20 ); + simdBody.v.Y = _mm256_permute2f128_ps( tt1, tt5, 0x20 ); + simdBody.w = _mm256_permute2f128_ps( tt2, tt6, 0x20 ); + simdBody.flags = _mm256_permute2f128_ps( tt3, tt7, 0x20 ); + simdBody.dp.X = _mm256_permute2f128_ps( tt0, tt4, 0x31 ); + simdBody.dp.Y = _mm256_permute2f128_ps( tt1, tt5, 0x31 ); + simdBody.dq.C = _mm256_permute2f128_ps( tt2, tt6, 0x31 ); + simdBody.dq.S = _mm256_permute2f128_ps( tt3, tt7, 0x31 ); return simdBody; } @@ -557,43 +978,308 @@ static void b2ScatterBodies( b2BodyState* restrict states, int* restrict indices { _Static_assert( sizeof( b2BodyState ) == 32, "b2BodyState not 32 bytes" ); B2_ASSERT( ( (uintptr_t)states & 0x1F ) == 0 ); - b2FloatW t0 = simde_mm256_unpacklo_ps( simdBody->v.X, simdBody->v.Y ); - b2FloatW t1 = simde_mm256_unpackhi_ps( simdBody->v.X, simdBody->v.Y ); - b2FloatW t2 = simde_mm256_unpacklo_ps( simdBody->w, simdBody->flags ); - b2FloatW t3 = simde_mm256_unpackhi_ps( simdBody->w, simdBody->flags ); - b2FloatW t4 = simde_mm256_unpacklo_ps( simdBody->dp.X, simdBody->dp.Y ); - b2FloatW t5 = simde_mm256_unpackhi_ps( simdBody->dp.X, simdBody->dp.Y ); - b2FloatW t6 = simde_mm256_unpacklo_ps( simdBody->dq.C, simdBody->dq.S ); - b2FloatW t7 = simde_mm256_unpackhi_ps( simdBody->dq.C, simdBody->dq.S ); - b2FloatW tt0 = simde_mm256_shuffle_ps( t0, t2, SIMDE_MM_SHUFFLE( 1, 0, 1, 0 ) ); - b2FloatW tt1 = simde_mm256_shuffle_ps( t0, t2, SIMDE_MM_SHUFFLE( 3, 2, 3, 2 ) ); - b2FloatW tt2 = simde_mm256_shuffle_ps( t1, t3, SIMDE_MM_SHUFFLE( 1, 0, 1, 0 ) ); - b2FloatW tt3 = simde_mm256_shuffle_ps( t1, t3, SIMDE_MM_SHUFFLE( 3, 2, 3, 2 ) ); - b2FloatW tt4 = simde_mm256_shuffle_ps( t4, t6, SIMDE_MM_SHUFFLE( 1, 0, 1, 0 ) ); - b2FloatW tt5 = simde_mm256_shuffle_ps( t4, t6, SIMDE_MM_SHUFFLE( 3, 2, 3, 2 ) ); - b2FloatW tt6 = simde_mm256_shuffle_ps( t5, t7, SIMDE_MM_SHUFFLE( 1, 0, 1, 0 ) ); - b2FloatW tt7 = simde_mm256_shuffle_ps( t5, t7, SIMDE_MM_SHUFFLE( 3, 2, 3, 2 ) ); + b2FloatW t0 = 
_mm256_unpacklo_ps( simdBody->v.X, simdBody->v.Y ); + b2FloatW t1 = _mm256_unpackhi_ps( simdBody->v.X, simdBody->v.Y ); + b2FloatW t2 = _mm256_unpacklo_ps( simdBody->w, simdBody->flags ); + b2FloatW t3 = _mm256_unpackhi_ps( simdBody->w, simdBody->flags ); + b2FloatW t4 = _mm256_unpacklo_ps( simdBody->dp.X, simdBody->dp.Y ); + b2FloatW t5 = _mm256_unpackhi_ps( simdBody->dp.X, simdBody->dp.Y ); + b2FloatW t6 = _mm256_unpacklo_ps( simdBody->dq.C, simdBody->dq.S ); + b2FloatW t7 = _mm256_unpackhi_ps( simdBody->dq.C, simdBody->dq.S ); + b2FloatW tt0 = _mm256_shuffle_ps( t0, t2, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + b2FloatW tt1 = _mm256_shuffle_ps( t0, t2, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + b2FloatW tt2 = _mm256_shuffle_ps( t1, t3, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + b2FloatW tt3 = _mm256_shuffle_ps( t1, t3, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + b2FloatW tt4 = _mm256_shuffle_ps( t4, t6, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + b2FloatW tt5 = _mm256_shuffle_ps( t4, t6, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + b2FloatW tt6 = _mm256_shuffle_ps( t5, t7, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + b2FloatW tt7 = _mm256_shuffle_ps( t5, t7, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // I don't use any dummy body in the body array because this will lead to multithreaded sharing and the // associated cache flushing. if ( indices[0] != B2_NULL_INDEX ) - simde_mm256_store_ps( (float*)( states + indices[0] ), simde_mm256_permute2f128_ps( tt0, tt4, 0x20 ) ); + _mm256_store_ps( (float*)( states + indices[0] ), _mm256_permute2f128_ps( tt0, tt4, 0x20 ) ); if ( indices[1] != B2_NULL_INDEX ) - simde_mm256_store_ps( (float*)( states + indices[1] ), simde_mm256_permute2f128_ps( tt1, tt5, 0x20 ) ); + _mm256_store_ps( (float*)( states + indices[1] ), _mm256_permute2f128_ps( tt1, tt5, 0x20 ) ); if ( indices[2] != B2_NULL_INDEX ) - simde_mm256_store_ps( (float*)( states + indices[2] ), simde_mm256_permute2f128_ps( tt2, tt6, 0x20 ) ); + _mm256_store_ps( (float*)( states + indices[2] ), _mm256_permute2f128_ps( tt2, tt6, 0x20 ) ); if ( indices[3] != B2_NULL_INDEX ) - simde_mm256_store_ps( (float*)( states + indices[3] ), simde_mm256_permute2f128_ps( tt3, tt7, 0x20 ) ); + _mm256_store_ps( (float*)( states + indices[3] ), _mm256_permute2f128_ps( tt3, tt7, 0x20 ) ); if ( indices[4] != B2_NULL_INDEX ) - simde_mm256_store_ps( (float*)( states + indices[4] ), simde_mm256_permute2f128_ps( tt0, tt4, 0x31 ) ); + _mm256_store_ps( (float*)( states + indices[4] ), _mm256_permute2f128_ps( tt0, tt4, 0x31 ) ); if ( indices[5] != B2_NULL_INDEX ) - simde_mm256_store_ps( (float*)( states + indices[5] ), simde_mm256_permute2f128_ps( tt1, tt5, 0x31 ) ); + _mm256_store_ps( (float*)( states + indices[5] ), _mm256_permute2f128_ps( tt1, tt5, 0x31 ) ); if ( indices[6] != B2_NULL_INDEX ) - simde_mm256_store_ps( (float*)( states + indices[6] ), simde_mm256_permute2f128_ps( tt2, tt6, 0x31 ) ); + _mm256_store_ps( (float*)( states + indices[6] ), _mm256_permute2f128_ps( tt2, tt6, 0x31 ) ); if ( indices[7] != B2_NULL_INDEX ) - simde_mm256_store_ps( (float*)( states + indices[7] ), simde_mm256_permute2f128_ps( tt3, tt7, 0x31 ) ); + _mm256_store_ps( (float*)( states + indices[7] ), _mm256_permute2f128_ps( tt3, tt7, 0x31 ) ); } +#elif defined( B2_SIMD_NEON ) + +// This is a load and transpose +static b2SimdBody b2GatherBodies( const b2BodyState* restrict states, int* restrict indices ) +{ + _Static_assert( sizeof( b2BodyState ) == 32, "b2BodyState not 32 bytes" ); + B2_ASSERT( ( (uintptr_t)states & 0x1F ) == 0 ); + + // [vx vy w flags] + b2FloatW identityA = b2ZeroW(); + + // [dpx dpy dqc dqs] + + b2FloatW identityB = 
b2SetW( 0.0f, 0.0f, 1.0f, 0.0f ); + + b2FloatW b1a = indices[0] == B2_NULL_INDEX ? identityA : b2LoadW( (float*)( states + indices[0] ) + 0 ); + b2FloatW b1b = indices[0] == B2_NULL_INDEX ? identityB : b2LoadW( (float*)( states + indices[0] ) + 4 ); + b2FloatW b2a = indices[1] == B2_NULL_INDEX ? identityA : b2LoadW( (float*)( states + indices[1] ) + 0 ); + b2FloatW b2b = indices[1] == B2_NULL_INDEX ? identityB : b2LoadW( (float*)( states + indices[1] ) + 4 ); + b2FloatW b3a = indices[2] == B2_NULL_INDEX ? identityA : b2LoadW( (float*)( states + indices[2] ) + 0 ); + b2FloatW b3b = indices[2] == B2_NULL_INDEX ? identityB : b2LoadW( (float*)( states + indices[2] ) + 4 ); + b2FloatW b4a = indices[3] == B2_NULL_INDEX ? identityA : b2LoadW( (float*)( states + indices[3] ) + 0 ); + b2FloatW b4b = indices[3] == B2_NULL_INDEX ? identityB : b2LoadW( (float*)( states + indices[3] ) + 4 ); + + // [vx1 vx3 vy1 vy3] + b2FloatW t1a = b2UnpackLoW( b1a, b3a ); + + // [vx2 vx4 vy2 vy4] + b2FloatW t2a = b2UnpackLoW( b2a, b4a ); + + // [w1 w3 f1 f3] + b2FloatW t3a = b2UnpackHiW( b1a, b3a ); + + // [w2 w4 f2 f4] + b2FloatW t4a = b2UnpackHiW( b2a, b4a ); + + b2SimdBody simdBody; + simdBody.v.X = b2UnpackLoW( t1a, t2a ); + simdBody.v.Y = b2UnpackHiW( t1a, t2a ); + simdBody.w = b2UnpackLoW( t3a, t4a ); + simdBody.flags = b2UnpackHiW( t3a, t4a ); + + b2FloatW t1b = b2UnpackLoW( b1b, b3b ); + b2FloatW t2b = b2UnpackLoW( b2b, b4b ); + b2FloatW t3b = b2UnpackHiW( b1b, b3b ); + b2FloatW t4b = b2UnpackHiW( b2b, b4b ); + + simdBody.dp.X = b2UnpackLoW( t1b, t2b ); + simdBody.dp.Y = b2UnpackHiW( t1b, t2b ); + simdBody.dq.C = b2UnpackLoW( t3b, t4b ); + simdBody.dq.S = b2UnpackHiW( t3b, t4b ); + + return simdBody; +} + +// This writes only the velocities back to the solver bodies +// https://developer.arm.com/documentation/102107a/0100/Floating-point-4x4-matrix-transposition +static void b2ScatterBodies( b2BodyState* restrict states, int* restrict indices, const b2SimdBody* restrict simdBody ) +{ + _Static_assert( sizeof( b2BodyState ) == 32, "b2BodyState not 32 bytes" ); + B2_ASSERT( ( (uintptr_t)states & 0x1F ) == 0 ); + + // b2FloatW x = b2SetW(0.0f, 1.0f, 2.0f, 3.0f); + // b2FloatW y = b2SetW(4.0f, 5.0f, 6.0f, 7.0f); + // b2FloatW z = b2SetW(8.0f, 9.0f, 10.0f, 11.0f); + // b2FloatW w = b2SetW(12.0f, 13.0f, 14.0f, 15.0f); + // + // float32x4x2_t rr1 = vtrnq_f32( x, y ); + // float32x4x2_t rr2 = vtrnq_f32( z, w ); + // + // float32x4_t b1 = vcombine_f32(vget_low_f32(rr1.val[0]), vget_low_f32(rr2.val[0])); + // float32x4_t b2 = vcombine_f32(vget_low_f32(rr1.val[1]), vget_low_f32(rr2.val[1])); + // float32x4_t b3 = vcombine_f32(vget_high_f32(rr1.val[0]), vget_high_f32(rr2.val[0])); + // float32x4_t b4 = vcombine_f32(vget_high_f32(rr1.val[1]), vget_high_f32(rr2.val[1])); + + // transpose + float32x4x2_t r1 = vtrnq_f32( simdBody->v.X, simdBody->v.Y ); + float32x4x2_t r2 = vtrnq_f32( simdBody->w, simdBody->flags ); + + // I don't use any dummy body in the body array because this will lead to multithreaded sharing and the + // associated cache flushing. 
+ if ( indices[0] != B2_NULL_INDEX ) + { + float32x4_t body1 = vcombine_f32( vget_low_f32( r1.val[0] ), vget_low_f32( r2.val[0] ) ); + b2StoreW( (float*)( states + indices[0] ), body1 ); + } + + if ( indices[1] != B2_NULL_INDEX ) + { + float32x4_t body2 = vcombine_f32( vget_low_f32( r1.val[1] ), vget_low_f32( r2.val[1] ) ); + b2StoreW( (float*)( states + indices[1] ), body2 ); + } + + if ( indices[2] != B2_NULL_INDEX ) + { + float32x4_t body3 = vcombine_f32( vget_high_f32( r1.val[0] ), vget_high_f32( r2.val[0] ) ); + b2StoreW( (float*)( states + indices[2] ), body3 ); + } + + if ( indices[3] != B2_NULL_INDEX ) + { + float32x4_t body4 = vcombine_f32( vget_high_f32( r1.val[1] ), vget_high_f32( r2.val[1] ) ); + b2StoreW( (float*)( states + indices[3] ), body4 ); + } +} + +#elif defined( B2_SIMD_SSE2 ) + +// This is a load and transpose +static b2SimdBody b2GatherBodies( const b2BodyState* restrict states, int* restrict indices ) +{ + _Static_assert( sizeof( b2BodyState ) == 32, "b2BodyState not 32 bytes" ); + B2_ASSERT( ( (uintptr_t)states & 0x1F ) == 0 ); + + // [vx vy w flags] + b2FloatW identityA = b2ZeroW(); + + // [dpx dpy dqc dqs] + b2FloatW identityB = b2SetW( 0.0f, 0.0f, 1.0f, 0.0f ); + + b2FloatW b1a = indices[0] == B2_NULL_INDEX ? identityA : b2LoadW( (float*)( states + indices[0] ) + 0 ); + b2FloatW b1b = indices[0] == B2_NULL_INDEX ? identityB : b2LoadW( (float*)( states + indices[0] ) + 4 ); + b2FloatW b2a = indices[1] == B2_NULL_INDEX ? identityA : b2LoadW( (float*)( states + indices[1] ) + 0 ); + b2FloatW b2b = indices[1] == B2_NULL_INDEX ? identityB : b2LoadW( (float*)( states + indices[1] ) + 4 ); + b2FloatW b3a = indices[2] == B2_NULL_INDEX ? identityA : b2LoadW( (float*)( states + indices[2] ) + 0 ); + b2FloatW b3b = indices[2] == B2_NULL_INDEX ? identityB : b2LoadW( (float*)( states + indices[2] ) + 4 ); + b2FloatW b4a = indices[3] == B2_NULL_INDEX ? identityA : b2LoadW( (float*)( states + indices[3] ) + 0 ); + b2FloatW b4b = indices[3] == B2_NULL_INDEX ? 
identityB : b2LoadW( (float*)( states + indices[3] ) + 4 ); + + // [vx1 vx3 vy1 vy3] + b2FloatW t1a = b2UnpackLoW( b1a, b3a ); + + // [vx2 vx4 vy2 vy4] + b2FloatW t2a = b2UnpackLoW( b2a, b4a ); + + // [w1 w3 f1 f3] + b2FloatW t3a = b2UnpackHiW( b1a, b3a ); + + // [w2 w4 f2 f4] + b2FloatW t4a = b2UnpackHiW( b2a, b4a ); + + b2SimdBody simdBody; + simdBody.v.X = b2UnpackLoW( t1a, t2a ); + simdBody.v.Y = b2UnpackHiW( t1a, t2a ); + simdBody.w = b2UnpackLoW( t3a, t4a ); + simdBody.flags = b2UnpackHiW( t3a, t4a ); + + b2FloatW t1b = b2UnpackLoW( b1b, b3b ); + b2FloatW t2b = b2UnpackLoW( b2b, b4b ); + b2FloatW t3b = b2UnpackHiW( b1b, b3b ); + b2FloatW t4b = b2UnpackHiW( b2b, b4b ); + + simdBody.dp.X = b2UnpackLoW( t1b, t2b ); + simdBody.dp.Y = b2UnpackHiW( t1b, t2b ); + simdBody.dq.C = b2UnpackLoW( t3b, t4b ); + simdBody.dq.S = b2UnpackHiW( t3b, t4b ); + + return simdBody; +} + +// This writes only the velocities back to the solver bodies +static void b2ScatterBodies( b2BodyState* restrict states, int* restrict indices, const b2SimdBody* restrict simdBody ) +{ + _Static_assert( sizeof( b2BodyState ) == 32, "b2BodyState not 32 bytes" ); + B2_ASSERT( ( (uintptr_t)states & 0x1F ) == 0 ); + + // [vx1 vy1 vx2 vy2] + b2FloatW t1 = b2UnpackLoW( simdBody->v.X, simdBody->v.Y ); + // [vx3 vy3 vx4 vy4] + b2FloatW t2 = b2UnpackHiW( simdBody->v.X, simdBody->v.Y ); + // [w1 f1 w2 f2] + b2FloatW t3 = b2UnpackLoW( simdBody->w, simdBody->flags ); + // [w3 f3 w4 f4] + b2FloatW t4 = b2UnpackHiW( simdBody->w, simdBody->flags ); + + // I don't use any dummy body in the body array because this will lead to multithreaded sharing and the + // associated cache flushing. + if ( indices[0] != B2_NULL_INDEX ) + { + // [t1.x t1.y t3.x t3.y] + b2StoreW( (float*)( states + indices[0] ), _mm_shuffle_ps( t1, t3, _MM_SHUFFLE( 1, 0, 1, 0 ) ) ); + } + + if ( indices[1] != B2_NULL_INDEX ) + { + // [t1.z t1.w t3.z t3.w] + b2StoreW( (float*)( states + indices[1] ), _mm_shuffle_ps( t1, t3, _MM_SHUFFLE( 3, 2, 3, 2 ) ) ); + } + + if ( indices[2] != B2_NULL_INDEX ) + { + // [t2.x t2.y t4.x t4.y] + b2StoreW( (float*)( states + indices[2] ), _mm_shuffle_ps( t2, t4, _MM_SHUFFLE( 1, 0, 1, 0 ) ) ); + } + + if ( indices[3] != B2_NULL_INDEX ) + { + // [t2.z t2.w t4.z t4.w] + b2StoreW( (float*)( states + indices[3] ), _mm_shuffle_ps( t2, t4, _MM_SHUFFLE( 3, 2, 3, 2 ) ) ); + } +} + +#else + +// This is a load and transpose +static b2SimdBody b2GatherBodies( const b2BodyState* restrict states, int* restrict indices ) +{ + b2BodyState identity = b2_identityBodyState; + + b2BodyState s1 = indices[0] == B2_NULL_INDEX ? identity : states[indices[0]]; + b2BodyState s2 = indices[1] == B2_NULL_INDEX ? identity : states[indices[1]]; + b2BodyState s3 = indices[2] == B2_NULL_INDEX ? identity : states[indices[2]]; + b2BodyState s4 = indices[3] == B2_NULL_INDEX ? 
identity : states[indices[3]]; + + b2SimdBody simdBody; + simdBody.v.X = ( b2FloatW ){ s1.linearVelocity.x, s2.linearVelocity.x, s3.linearVelocity.x, s4.linearVelocity.x }; + simdBody.v.Y = ( b2FloatW ){ s1.linearVelocity.y, s2.linearVelocity.y, s3.linearVelocity.y, s4.linearVelocity.y }; + simdBody.w = ( b2FloatW ){ s1.angularVelocity, s2.angularVelocity, s3.angularVelocity, s4.angularVelocity }; + simdBody.flags = ( b2FloatW ){ (float)s1.flags, (float)s2.flags, (float)s3.flags, (float)s4.flags }; + simdBody.dp.X = ( b2FloatW ){ s1.deltaPosition.x, s2.deltaPosition.x, s3.deltaPosition.x, s4.deltaPosition.x }; + simdBody.dp.Y = ( b2FloatW ){ s1.deltaPosition.y, s2.deltaPosition.y, s3.deltaPosition.y, s4.deltaPosition.y }; + simdBody.dq.C = ( b2FloatW ){ s1.deltaRotation.c, s2.deltaRotation.c, s3.deltaRotation.c, s4.deltaRotation.c }; + simdBody.dq.S = ( b2FloatW ){ s1.deltaRotation.s, s2.deltaRotation.s, s3.deltaRotation.s, s4.deltaRotation.s }; + + return simdBody; +} + +// This writes only the velocities back to the solver bodies +static void b2ScatterBodies( b2BodyState* restrict states, int* restrict indices, const b2SimdBody* restrict simdBody ) +{ + if ( indices[0] != B2_NULL_INDEX ) + { + b2BodyState* state = states + indices[0]; + state->linearVelocity.x = simdBody->v.X.x; + state->linearVelocity.y = simdBody->v.Y.x; + state->angularVelocity = simdBody->w.x; + } + + if ( indices[1] != B2_NULL_INDEX ) + { + b2BodyState* state = states + indices[1]; + state->linearVelocity.x = simdBody->v.X.y; + state->linearVelocity.y = simdBody->v.Y.y; + state->angularVelocity = simdBody->w.y; + } + + if ( indices[2] != B2_NULL_INDEX ) + { + b2BodyState* state = states + indices[2]; + state->linearVelocity.x = simdBody->v.X.z; + state->linearVelocity.y = simdBody->v.Y.z; + state->angularVelocity = simdBody->w.z; + } + + if ( indices[3] != B2_NULL_INDEX ) + { + b2BodyState* state = states + indices[3]; + state->linearVelocity.x = simdBody->v.X.w; + state->linearVelocity.y = simdBody->v.Y.w; + state->angularVelocity = simdBody->w.w; + } +} + +#endif + void b2PrepareContactsTask( int startIndex, int endIndex, b2StepContext* context ) { b2TracyCZoneNC( prepare_contact, "Prepare Contact", b2_colorYellow, true ); @@ -615,9 +1301,9 @@ void b2PrepareContactsTask( int startIndex, int endIndex, b2StepContext* context { b2ContactConstraintSIMD* constraint = constraints + i; - for ( int j = 0; j < 8; ++j ) + for ( int j = 0; j < B2_SIMD_WIDTH; ++j ) { - b2ContactSim* contactSim = contacts[8 * i + j]; + b2ContactSim* contactSim = contacts[B2_SIMD_WIDTH * i + j]; if ( contactSim != NULL ) { @@ -769,36 +1455,43 @@ void b2PrepareContactsTask( int startIndex, int endIndex, b2StepContext* context // SIMD remainder constraint->indexA[j] = B2_NULL_INDEX; constraint->indexB[j] = B2_NULL_INDEX; + + ( (float*)&constraint->invMassA )[j] = 0.0f; + ( (float*)&constraint->invMassB )[j] = 0.0f; + ( (float*)&constraint->invIA )[j] = 0.0f; + ( (float*)&constraint->invIB )[j] = 0.0f; + + ( (float*)&constraint->normal.X )[j] = 0.0f; + ( (float*)&constraint->normal.Y )[j] = 0.0f; ( (float*)&constraint->friction )[j] = 0.0f; - ( (float*)&constraint->restitution )[j] = 0.0f; ( (float*)&constraint->biasRate )[j] = 0.0f; ( (float*)&constraint->massScale )[j] = 0.0f; ( (float*)&constraint->impulseScale )[j] = 0.0f; - ( (float*)&constraint->normal.X )[j] = 0.0f; - ( (float*)&constraint->normal.Y )[j] = 0.0f; - ( (float*)&constraint->baseSeparation1 )[j] = 0.0f; - ( (float*)&constraint->normalImpulse1 )[j] = 0.0f; - ( 
(float*)&constraint->tangentImpulse1 )[j] = 0.0f; - ( (float*)&constraint->maxNormalImpulse1 )[j] = 0.0f; ( (float*)&constraint->anchorA1.X )[j] = 0.0f; ( (float*)&constraint->anchorA1.Y )[j] = 0.0f; ( (float*)&constraint->anchorB1.X )[j] = 0.0f; ( (float*)&constraint->anchorB1.Y )[j] = 0.0f; + ( (float*)&constraint->baseSeparation1 )[j] = 0.0f; + ( (float*)&constraint->normalImpulse1 )[j] = 0.0f; + ( (float*)&constraint->tangentImpulse1 )[j] = 0.0f; + ( (float*)&constraint->maxNormalImpulse1 )[j] = 0.0f; ( (float*)&constraint->normalMass1 )[j] = 0.0f; ( (float*)&constraint->tangentMass1 )[j] = 0.0f; - ( (float*)&constraint->relativeVelocity1 )[j] = 0.0f; - ( (float*)&constraint->baseSeparation2 )[j] = 0.0f; - ( (float*)&constraint->normalImpulse2 )[j] = 0.0f; - ( (float*)&constraint->tangentImpulse2 )[j] = 0.0f; - ( (float*)&constraint->maxNormalImpulse2 )[j] = 0.0f; ( (float*)&constraint->anchorA2.X )[j] = 0.0f; ( (float*)&constraint->anchorA2.Y )[j] = 0.0f; ( (float*)&constraint->anchorB2.X )[j] = 0.0f; ( (float*)&constraint->anchorB2.Y )[j] = 0.0f; + ( (float*)&constraint->baseSeparation2 )[j] = 0.0f; + ( (float*)&constraint->normalImpulse2 )[j] = 0.0f; + ( (float*)&constraint->tangentImpulse2 )[j] = 0.0f; + ( (float*)&constraint->maxNormalImpulse2 )[j] = 0.0f; ( (float*)&constraint->normalMass2 )[j] = 0.0f; ( (float*)&constraint->tangentMass2 )[j] = 0.0f; + + ( (float*)&constraint->restitution )[j] = 0.0f; + ( (float*)&constraint->relativeVelocity1 )[j] = 0.0f; ( (float*)&constraint->relativeVelocity2 )[j] = 0.0f; } } @@ -821,7 +1514,7 @@ void b2WarmStartContactsTask( int startIndex, int endIndex, b2StepContext* conte b2SimdBody bB = b2GatherBodies( states, c->indexB ); b2FloatW tangentX = c->normal.Y; - b2FloatW tangentY = sub( simde_mm256_setzero_ps(), c->normal.X ); + b2FloatW tangentY = b2SubW( b2ZeroW(), c->normal.X ); { // fixed anchors @@ -829,14 +1522,14 @@ void b2WarmStartContactsTask( int startIndex, int endIndex, b2StepContext* conte b2Vec2W rB = c->anchorB1; b2Vec2W P; - P.X = add( mul( c->normalImpulse1, c->normal.X ), mul( c->tangentImpulse1, tangentX ) ); - P.Y = add( mul( c->normalImpulse1, c->normal.Y ), mul( c->tangentImpulse1, tangentY ) ); - bA.w = mulsub( bA.w, c->invIA, b2CrossW( rA, P ) ); - bA.v.X = mulsub( bA.v.X, c->invMassA, P.X ); - bA.v.Y = mulsub( bA.v.Y, c->invMassA, P.Y ); - bB.w = muladd( bB.w, c->invIB, b2CrossW( rB, P ) ); - bB.v.X = muladd( bB.v.X, c->invMassB, P.X ); - bB.v.Y = muladd( bB.v.Y, c->invMassB, P.Y ); + P.X = b2AddW( b2MulW( c->normalImpulse1, c->normal.X ), b2MulW( c->tangentImpulse1, tangentX ) ); + P.Y = b2AddW( b2MulW( c->normalImpulse1, c->normal.Y ), b2MulW( c->tangentImpulse1, tangentY ) ); + bA.w = b2MulSubW( bA.w, c->invIA, b2CrossW( rA, P ) ); + bA.v.X = b2MulSubW( bA.v.X, c->invMassA, P.X ); + bA.v.Y = b2MulSubW( bA.v.Y, c->invMassA, P.Y ); + bB.w = b2MulAddW( bB.w, c->invIB, b2CrossW( rB, P ) ); + bB.v.X = b2MulAddW( bB.v.X, c->invMassB, P.X ); + bB.v.Y = b2MulAddW( bB.v.Y, c->invMassB, P.Y ); } { @@ -845,14 +1538,14 @@ void b2WarmStartContactsTask( int startIndex, int endIndex, b2StepContext* conte b2Vec2W rB = c->anchorB2; b2Vec2W P; - P.X = add( mul( c->normalImpulse2, c->normal.X ), mul( c->tangentImpulse2, tangentX ) ); - P.Y = add( mul( c->normalImpulse2, c->normal.Y ), mul( c->tangentImpulse2, tangentY ) ); - bA.w = mulsub( bA.w, c->invIA, b2CrossW( rA, P ) ); - bA.v.X = mulsub( bA.v.X, c->invMassA, P.X ); - bA.v.Y = mulsub( bA.v.Y, c->invMassA, P.Y ); - bB.w = muladd( bB.w, c->invIB, b2CrossW( rB, P ) ); - bB.v.X = 
muladd( bB.v.X, c->invMassB, P.X ); - bB.v.Y = muladd( bB.v.Y, c->invMassB, P.Y ); + P.X = b2AddW( b2MulW( c->normalImpulse2, c->normal.X ), b2MulW( c->tangentImpulse2, tangentX ) ); + P.Y = b2AddW( b2MulW( c->normalImpulse2, c->normal.Y ), b2MulW( c->tangentImpulse2, tangentY ) ); + bA.w = b2MulSubW( bA.w, c->invIA, b2CrossW( rA, P ) ); + bA.v.X = b2MulSubW( bA.v.X, c->invMassA, P.X ); + bA.v.Y = b2MulSubW( bA.v.Y, c->invMassA, P.Y ); + bB.w = b2MulAddW( bB.w, c->invIB, b2CrossW( rB, P ) ); + bB.v.X = b2MulAddW( bB.v.X, c->invMassB, P.X ); + bB.v.Y = b2MulAddW( bB.v.Y, c->invMassB, P.Y ); } b2ScatterBodies( states, c->indexA, &bA ); @@ -868,8 +1561,8 @@ void b2SolveContactsTask( int startIndex, int endIndex, b2StepContext* context, b2BodyState* states = context->states; b2ContactConstraintSIMD* constraints = context->graph->colors[colorIndex].simdConstraints; - b2FloatW inv_h = simde_mm256_set1_ps( context->inv_h ); - b2FloatW minBiasVel = simde_mm256_set1_ps( -context->world->contactPushoutVelocity ); + b2FloatW inv_h = b2SplatW( context->inv_h ); + b2FloatW minBiasVel = b2SplatW( -context->world->contactPushoutVelocity ); for ( int i = startIndex; i < endIndex; ++i ) { @@ -887,12 +1580,12 @@ void b2SolveContactsTask( int startIndex, int endIndex, b2StepContext* context, } else { - biasRate = simde_mm256_setzero_ps(); - massScale = simde_mm256_set1_ps( 1.0f ); - impulseScale = simde_mm256_setzero_ps(); + biasRate = b2ZeroW(); + massScale = b2SplatW( 1.0f ); + impulseScale = b2ZeroW(); } - b2Vec2W dp = { sub( bB.dp.X, bA.dp.X ), sub( bB.dp.Y, bA.dp.Y ) }; + b2Vec2W dp = { b2SubW( bB.dp.X, bA.dp.X ), b2SubW( bB.dp.Y, bA.dp.Y ) }; // point1 non-penetration constraint { @@ -901,48 +1594,46 @@ void b2SolveContactsTask( int startIndex, int endIndex, b2StepContext* context, b2Vec2W rsB = b2RotateVectorW( bB.dq, c->anchorB1 ); // compute current separation - // todo this is subject to round-off error if the anchor is far from the body center of mass - // todo for example a large world with a single static body and many offset shapes - b2Vec2W ds = { add( dp.X, sub( rsB.X, rsA.X ) ), add( dp.Y, sub( rsB.Y, rsA.Y ) ) }; - b2FloatW s = add( b2DotW( c->normal, ds ), c->baseSeparation1 ); - - b2FloatW test = simde_mm256_cmp_ps( s, simde_mm256_setzero_ps(), SIMDE_CMP_GT_OQ ); - b2FloatW specBias = mul( s, inv_h ); - b2FloatW softBias = simde_mm256_max_ps( mul( biasRate, s ), minBiasVel ); + // this is subject to round-off error if the anchor is far from the body center of mass + b2Vec2W ds = { b2AddW( dp.X, b2SubW( rsB.X, rsA.X ) ), b2AddW( dp.Y, b2SubW( rsB.Y, rsA.Y ) ) }; + b2FloatW s = b2AddW( b2DotW( c->normal, ds ), c->baseSeparation1 ); - // #todo slow on SSE2 - b2FloatW bias = simde_mm256_blendv_ps( softBias, specBias, test ); + // Apply speculative bias if separation is greater than zero, otherwise apply soft constraint bias + b2FloatW mask = b2GreaterThanW( s, b2ZeroW() ); + b2FloatW specBias = b2MulW( s, inv_h ); + b2FloatW softBias = b2MaxW( b2MulW( biasRate, s ), minBiasVel ); + b2FloatW bias = b2BlendW( softBias, specBias, mask ); // fixed anchors for Jacobians b2Vec2W rA = c->anchorA1; b2Vec2W rB = c->anchorB1; // Relative velocity at contact - b2FloatW dvx = sub( sub( bB.v.X, mul( bB.w, rB.Y ) ), sub( bA.v.X, mul( bA.w, rA.Y ) ) ); - b2FloatW dvy = sub( add( bB.v.Y, mul( bB.w, rB.X ) ), add( bA.v.Y, mul( bA.w, rA.X ) ) ); - b2FloatW vn = add( mul( dvx, c->normal.X ), mul( dvy, c->normal.Y ) ); + b2FloatW dvx = b2SubW( b2SubW( bB.v.X, b2MulW( bB.w, rB.Y ) ), b2SubW( bA.v.X, b2MulW( bA.w, 
rA.Y ) ) ); + b2FloatW dvy = b2SubW( b2AddW( bB.v.Y, b2MulW( bB.w, rB.X ) ), b2AddW( bA.v.Y, b2MulW( bA.w, rA.X ) ) ); + b2FloatW vn = b2AddW( b2MulW( dvx, c->normal.X ), b2MulW( dvy, c->normal.Y ) ); // Compute normal impulse - b2FloatW negImpulse = - add( mul( c->normalMass1, mul( massScale, add( vn, bias ) ) ), mul( impulseScale, c->normalImpulse1 ) ); + b2FloatW negImpulse = b2AddW( b2MulW( c->normalMass1, b2MulW( massScale, b2AddW( vn, bias ) ) ), + b2MulW( impulseScale, c->normalImpulse1 ) ); // Clamp the accumulated impulse - b2FloatW newImpulse = simde_mm256_max_ps( sub( c->normalImpulse1, negImpulse ), simde_mm256_setzero_ps() ); - b2FloatW impulse = sub( newImpulse, c->normalImpulse1 ); + b2FloatW newImpulse = b2MaxW( b2SubW( c->normalImpulse1, negImpulse ), b2ZeroW() ); + b2FloatW impulse = b2SubW( newImpulse, c->normalImpulse1 ); c->normalImpulse1 = newImpulse; - c->maxNormalImpulse1 = simde_mm256_max_ps( c->maxNormalImpulse1, newImpulse ); + c->maxNormalImpulse1 = b2MaxW( c->maxNormalImpulse1, newImpulse ); // Apply contact impulse - b2FloatW Px = mul( impulse, c->normal.X ); - b2FloatW Py = mul( impulse, c->normal.Y ); + b2FloatW Px = b2MulW( impulse, c->normal.X ); + b2FloatW Py = b2MulW( impulse, c->normal.Y ); - bA.v.X = mulsub( bA.v.X, c->invMassA, Px ); - bA.v.Y = mulsub( bA.v.Y, c->invMassA, Py ); - bA.w = mulsub( bA.w, c->invIA, sub( mul( rA.X, Py ), mul( rA.Y, Px ) ) ); + bA.v.X = b2MulSubW( bA.v.X, c->invMassA, Px ); + bA.v.Y = b2MulSubW( bA.v.Y, c->invMassA, Py ); + bA.w = b2MulSubW( bA.w, c->invIA, b2SubW( b2MulW( rA.X, Py ), b2MulW( rA.Y, Px ) ) ); - bB.v.X = muladd( bB.v.X, c->invMassB, Px ); - bB.v.Y = muladd( bB.v.Y, c->invMassB, Py ); - bB.w = muladd( bB.w, c->invIB, sub( mul( rB.X, Py ), mul( rB.Y, Px ) ) ); + bB.v.X = b2MulAddW( bB.v.X, c->invMassB, Px ); + bB.v.Y = b2MulAddW( bB.v.Y, c->invMassB, Py ); + bB.w = b2MulAddW( bB.w, c->invIB, b2SubW( b2MulW( rB.X, Py ), b2MulW( rB.Y, Px ) ) ); } // second point non-penetration constraint @@ -952,50 +1643,48 @@ void b2SolveContactsTask( int startIndex, int endIndex, b2StepContext* context, b2Vec2W rsB = b2RotateVectorW( bB.dq, c->anchorB2 ); // compute current separation - b2Vec2W ds = { add( dp.X, sub( rsB.X, rsA.X ) ), add( dp.Y, sub( rsB.Y, rsA.Y ) ) }; - b2FloatW s = add( b2DotW( c->normal, ds ), c->baseSeparation2 ); + b2Vec2W ds = { b2AddW( dp.X, b2SubW( rsB.X, rsA.X ) ), b2AddW( dp.Y, b2SubW( rsB.Y, rsA.Y ) ) }; + b2FloatW s = b2AddW( b2DotW( c->normal, ds ), c->baseSeparation2 ); - b2FloatW test = simde_mm256_cmp_ps( s, simde_mm256_setzero_ps(), SIMDE_CMP_GT_OQ ); - b2FloatW specBias = mul( s, inv_h ); - b2FloatW softBias = simde_mm256_max_ps( mul( biasRate, s ), minBiasVel ); - - // #todo slow on SSE2 - b2FloatW bias = simde_mm256_blendv_ps( softBias, specBias, test ); + b2FloatW mask = b2GreaterThanW( s, b2ZeroW() ); + b2FloatW specBias = b2MulW( s, inv_h ); + b2FloatW softBias = b2MaxW( b2MulW( biasRate, s ), minBiasVel ); + b2FloatW bias = b2BlendW( softBias, specBias, mask ); // fixed anchors for Jacobians b2Vec2W rA = c->anchorA2; b2Vec2W rB = c->anchorB2; // Relative velocity at contact - b2FloatW dvx = sub( sub( bB.v.X, mul( bB.w, rB.Y ) ), sub( bA.v.X, mul( bA.w, rA.Y ) ) ); - b2FloatW dvy = sub( add( bB.v.Y, mul( bB.w, rB.X ) ), add( bA.v.Y, mul( bA.w, rA.X ) ) ); - b2FloatW vn = add( mul( dvx, c->normal.X ), mul( dvy, c->normal.Y ) ); + b2FloatW dvx = b2SubW( b2SubW( bB.v.X, b2MulW( bB.w, rB.Y ) ), b2SubW( bA.v.X, b2MulW( bA.w, rA.Y ) ) ); + b2FloatW dvy = b2SubW( b2AddW( bB.v.Y, b2MulW( bB.w, 
rB.X ) ), b2AddW( bA.v.Y, b2MulW( bA.w, rA.X ) ) ); + b2FloatW vn = b2AddW( b2MulW( dvx, c->normal.X ), b2MulW( dvy, c->normal.Y ) ); // Compute normal impulse - b2FloatW negImpulse = - add( mul( c->normalMass2, mul( massScale, add( vn, bias ) ) ), mul( impulseScale, c->normalImpulse2 ) ); + b2FloatW negImpulse = b2AddW( b2MulW( c->normalMass2, b2MulW( massScale, b2AddW( vn, bias ) ) ), + b2MulW( impulseScale, c->normalImpulse2 ) ); // Clamp the accumulated impulse - b2FloatW newImpulse = simde_mm256_max_ps( sub( c->normalImpulse2, negImpulse ), simde_mm256_setzero_ps() ); - b2FloatW impulse = sub( newImpulse, c->normalImpulse2 ); + b2FloatW newImpulse = b2MaxW( b2SubW( c->normalImpulse2, negImpulse ), b2ZeroW() ); + b2FloatW impulse = b2SubW( newImpulse, c->normalImpulse2 ); c->normalImpulse2 = newImpulse; - c->maxNormalImpulse2 = simde_mm256_max_ps( c->maxNormalImpulse2, newImpulse ); + c->maxNormalImpulse2 = b2MaxW( c->maxNormalImpulse2, newImpulse ); // Apply contact impulse - b2FloatW Px = mul( impulse, c->normal.X ); - b2FloatW Py = mul( impulse, c->normal.Y ); + b2FloatW Px = b2MulW( impulse, c->normal.X ); + b2FloatW Py = b2MulW( impulse, c->normal.Y ); - bA.v.X = mulsub( bA.v.X, c->invMassA, Px ); - bA.v.Y = mulsub( bA.v.Y, c->invMassA, Py ); - bA.w = mulsub( bA.w, c->invIA, sub( mul( rA.X, Py ), mul( rA.Y, Px ) ) ); + bA.v.X = b2MulSubW( bA.v.X, c->invMassA, Px ); + bA.v.Y = b2MulSubW( bA.v.Y, c->invMassA, Py ); + bA.w = b2MulSubW( bA.w, c->invIA, b2SubW( b2MulW( rA.X, Py ), b2MulW( rA.Y, Px ) ) ); - bB.v.X = muladd( bB.v.X, c->invMassB, Px ); - bB.v.Y = muladd( bB.v.Y, c->invMassB, Py ); - bB.w = muladd( bB.w, c->invIB, sub( mul( rB.X, Py ), mul( rB.Y, Px ) ) ); + bB.v.X = b2MulAddW( bB.v.X, c->invMassB, Px ); + bB.v.Y = b2MulAddW( bB.v.Y, c->invMassB, Py ); + bB.w = b2MulAddW( bB.w, c->invIB, b2SubW( b2MulW( rB.X, Py ), b2MulW( rB.Y, Px ) ) ); } b2FloatW tangentX = c->normal.Y; - b2FloatW tangentY = sub( simde_mm256_setzero_ps(), c->normal.X ); + b2FloatW tangentY = b2SubW( b2ZeroW(), c->normal.X ); // point 1 friction constraint { @@ -1004,32 +1693,31 @@ void b2SolveContactsTask( int startIndex, int endIndex, b2StepContext* context, b2Vec2W rB = c->anchorB1; // Relative velocity at contact - b2FloatW dvx = sub( sub( bB.v.X, mul( bB.w, rB.Y ) ), sub( bA.v.X, mul( bA.w, rA.Y ) ) ); - b2FloatW dvy = sub( add( bB.v.Y, mul( bB.w, rB.X ) ), add( bA.v.Y, mul( bA.w, rA.X ) ) ); - b2FloatW vt = add( mul( dvx, tangentX ), mul( dvy, tangentY ) ); + b2FloatW dvx = b2SubW( b2SubW( bB.v.X, b2MulW( bB.w, rB.Y ) ), b2SubW( bA.v.X, b2MulW( bA.w, rA.Y ) ) ); + b2FloatW dvy = b2SubW( b2AddW( bB.v.Y, b2MulW( bB.w, rB.X ) ), b2AddW( bA.v.Y, b2MulW( bA.w, rA.X ) ) ); + b2FloatW vt = b2AddW( b2MulW( dvx, tangentX ), b2MulW( dvy, tangentY ) ); // Compute tangent force - b2FloatW negImpulse = mul( c->tangentMass1, vt ); + b2FloatW negImpulse = b2MulW( c->tangentMass1, vt ); // Clamp the accumulated force - b2FloatW maxFriction = mul( c->friction, c->normalImpulse1 ); - b2FloatW newImpulse = sub( c->tangentImpulse1, negImpulse ); - newImpulse = - simde_mm256_max_ps( sub( simde_mm256_setzero_ps(), maxFriction ), simde_mm256_min_ps( newImpulse, maxFriction ) ); - b2FloatW impulse = sub( newImpulse, c->tangentImpulse1 ); + b2FloatW maxFriction = b2MulW( c->friction, c->normalImpulse1 ); + b2FloatW newImpulse = b2SubW( c->tangentImpulse1, negImpulse ); + newImpulse = b2MaxW( b2SubW( b2ZeroW(), maxFriction ), b2MinW( newImpulse, maxFriction ) ); + b2FloatW impulse = b2SubW( newImpulse, c->tangentImpulse1 ); 
c->tangentImpulse1 = newImpulse; // Apply contact impulse - b2FloatW Px = mul( impulse, tangentX ); - b2FloatW Py = mul( impulse, tangentY ); + b2FloatW Px = b2MulW( impulse, tangentX ); + b2FloatW Py = b2MulW( impulse, tangentY ); - bA.v.X = mulsub( bA.v.X, c->invMassA, Px ); - bA.v.Y = mulsub( bA.v.Y, c->invMassA, Py ); - bA.w = mulsub( bA.w, c->invIA, sub( mul( rA.X, Py ), mul( rA.Y, Px ) ) ); + bA.v.X = b2MulSubW( bA.v.X, c->invMassA, Px ); + bA.v.Y = b2MulSubW( bA.v.Y, c->invMassA, Py ); + bA.w = b2MulSubW( bA.w, c->invIA, b2SubW( b2MulW( rA.X, Py ), b2MulW( rA.Y, Px ) ) ); - bB.v.X = muladd( bB.v.X, c->invMassB, Px ); - bB.v.Y = muladd( bB.v.Y, c->invMassB, Py ); - bB.w = muladd( bB.w, c->invIB, sub( mul( rB.X, Py ), mul( rB.Y, Px ) ) ); + bB.v.X = b2MulAddW( bB.v.X, c->invMassB, Px ); + bB.v.Y = b2MulAddW( bB.v.Y, c->invMassB, Py ); + bB.w = b2MulAddW( bB.w, c->invIB, b2SubW( b2MulW( rB.X, Py ), b2MulW( rB.Y, Px ) ) ); } // second point friction constraint @@ -1039,32 +1727,31 @@ void b2SolveContactsTask( int startIndex, int endIndex, b2StepContext* context, b2Vec2W rB = c->anchorB2; // Relative velocity at contact - b2FloatW dvx = sub( sub( bB.v.X, mul( bB.w, rB.Y ) ), sub( bA.v.X, mul( bA.w, rA.Y ) ) ); - b2FloatW dvy = sub( add( bB.v.Y, mul( bB.w, rB.X ) ), add( bA.v.Y, mul( bA.w, rA.X ) ) ); - b2FloatW vt = add( mul( dvx, tangentX ), mul( dvy, tangentY ) ); + b2FloatW dvx = b2SubW( b2SubW( bB.v.X, b2MulW( bB.w, rB.Y ) ), b2SubW( bA.v.X, b2MulW( bA.w, rA.Y ) ) ); + b2FloatW dvy = b2SubW( b2AddW( bB.v.Y, b2MulW( bB.w, rB.X ) ), b2AddW( bA.v.Y, b2MulW( bA.w, rA.X ) ) ); + b2FloatW vt = b2AddW( b2MulW( dvx, tangentX ), b2MulW( dvy, tangentY ) ); // Compute tangent force - b2FloatW negImpulse = mul( c->tangentMass2, vt ); + b2FloatW negImpulse = b2MulW( c->tangentMass2, vt ); // Clamp the accumulated force - b2FloatW maxFriction = mul( c->friction, c->normalImpulse2 ); - b2FloatW newImpulse = sub( c->tangentImpulse2, negImpulse ); - newImpulse = - simde_mm256_max_ps( sub( simde_mm256_setzero_ps(), maxFriction ), simde_mm256_min_ps( newImpulse, maxFriction ) ); - b2FloatW impulse = sub( newImpulse, c->tangentImpulse2 ); + b2FloatW maxFriction = b2MulW( c->friction, c->normalImpulse2 ); + b2FloatW newImpulse = b2SubW( c->tangentImpulse2, negImpulse ); + newImpulse = b2MaxW( b2SubW( b2ZeroW(), maxFriction ), b2MinW( newImpulse, maxFriction ) ); + b2FloatW impulse = b2SubW( newImpulse, c->tangentImpulse2 ); c->tangentImpulse2 = newImpulse; // Apply contact impulse - b2FloatW Px = mul( impulse, tangentX ); - b2FloatW Py = mul( impulse, tangentY ); + b2FloatW Px = b2MulW( impulse, tangentX ); + b2FloatW Py = b2MulW( impulse, tangentY ); - bA.v.X = mulsub( bA.v.X, c->invMassA, Px ); - bA.v.Y = mulsub( bA.v.Y, c->invMassA, Py ); - bA.w = mulsub( bA.w, c->invIA, sub( mul( rA.X, Py ), mul( rA.Y, Px ) ) ); + bA.v.X = b2MulSubW( bA.v.X, c->invMassA, Px ); + bA.v.Y = b2MulSubW( bA.v.Y, c->invMassA, Py ); + bA.w = b2MulSubW( bA.w, c->invIA, b2SubW( b2MulW( rA.X, Py ), b2MulW( rA.Y, Px ) ) ); - bB.v.X = muladd( bB.v.X, c->invMassB, Px ); - bB.v.Y = muladd( bB.v.Y, c->invMassB, Py ); - bB.w = muladd( bB.w, c->invIB, sub( mul( rB.X, Py ), mul( rB.Y, Px ) ) ); + bB.v.X = b2MulAddW( bB.v.X, c->invMassB, Px ); + bB.v.Y = b2MulAddW( bB.v.Y, c->invMassB, Py ); + bB.w = b2MulAddW( bB.w, c->invIB, b2SubW( b2MulW( rB.X, Py ), b2MulW( rB.Y, Px ) ) ); } b2ScatterBodies( states, c->indexA, &bA ); @@ -1080,8 +1767,8 @@ void b2ApplyRestitutionTask( int startIndex, int endIndex, b2StepContext* contex b2BodyState* 
states = context->states; b2ContactConstraintSIMD* constraints = context->graph->colors[colorIndex].simdConstraints; - b2FloatW threshold = simde_mm256_set1_ps( context->world->restitutionThreshold ); - b2FloatW zero = simde_mm256_setzero_ps(); + b2FloatW threshold = b2SplatW( context->world->restitutionThreshold ); + b2FloatW zero = b2ZeroW(); for ( int i = startIndex; i < endIndex; ++i ) { @@ -1093,81 +1780,77 @@ void b2ApplyRestitutionTask( int startIndex, int endIndex, b2StepContext* contex // first point non-penetration constraint { // Set effective mass to zero if restitution should not be applied - b2FloatW test1 = simde_mm256_cmp_ps( add( c->relativeVelocity1, threshold ), zero, SIMDE_CMP_GT_OQ ); - b2FloatW test2 = simde_mm256_cmp_ps( c->maxNormalImpulse1, zero, SIMDE_CMP_EQ_OQ ); - b2FloatW test = simde_mm256_or_ps( test1, test2 ); - - // todo slow on SSE2 - b2FloatW mass = simde_mm256_blendv_ps( c->normalMass1, zero, test ); + b2FloatW mask1 = b2GreaterThanW( b2AddW( c->relativeVelocity1, threshold ), zero ); + b2FloatW mask2 = b2EqualsW( c->maxNormalImpulse1, zero ); + b2FloatW mask = b2OrW( mask1, mask2 ); + b2FloatW mass = b2BlendW( c->normalMass1, zero, mask ); // fixed anchors for Jacobians b2Vec2W rA = c->anchorA1; b2Vec2W rB = c->anchorB1; // Relative velocity at contact - b2FloatW dvx = sub( sub( bB.v.X, mul( bB.w, rB.Y ) ), sub( bA.v.X, mul( bA.w, rA.Y ) ) ); - b2FloatW dvy = sub( add( bB.v.Y, mul( bB.w, rB.X ) ), add( bA.v.Y, mul( bA.w, rA.X ) ) ); - b2FloatW vn = add( mul( dvx, c->normal.X ), mul( dvy, c->normal.Y ) ); + b2FloatW dvx = b2SubW( b2SubW( bB.v.X, b2MulW( bB.w, rB.Y ) ), b2SubW( bA.v.X, b2MulW( bA.w, rA.Y ) ) ); + b2FloatW dvy = b2SubW( b2AddW( bB.v.Y, b2MulW( bB.w, rB.X ) ), b2AddW( bA.v.Y, b2MulW( bA.w, rA.X ) ) ); + b2FloatW vn = b2AddW( b2MulW( dvx, c->normal.X ), b2MulW( dvy, c->normal.Y ) ); // Compute normal impulse - b2FloatW negImpulse = mul( mass, add( vn, mul( c->restitution, c->relativeVelocity1 ) ) ); + b2FloatW negImpulse = b2MulW( mass, b2AddW( vn, b2MulW( c->restitution, c->relativeVelocity1 ) ) ); // Clamp the accumulated impulse - b2FloatW newImpulse = simde_mm256_max_ps( sub( c->normalImpulse1, negImpulse ), simde_mm256_setzero_ps() ); - b2FloatW impulse = sub( newImpulse, c->normalImpulse1 ); + b2FloatW newImpulse = b2MaxW( b2SubW( c->normalImpulse1, negImpulse ), b2ZeroW() ); + b2FloatW impulse = b2SubW( newImpulse, c->normalImpulse1 ); c->normalImpulse1 = newImpulse; // Apply contact impulse - b2FloatW Px = mul( impulse, c->normal.X ); - b2FloatW Py = mul( impulse, c->normal.Y ); + b2FloatW Px = b2MulW( impulse, c->normal.X ); + b2FloatW Py = b2MulW( impulse, c->normal.Y ); - bA.v.X = mulsub( bA.v.X, c->invMassA, Px ); - bA.v.Y = mulsub( bA.v.Y, c->invMassA, Py ); - bA.w = mulsub( bA.w, c->invIA, sub( mul( rA.X, Py ), mul( rA.Y, Px ) ) ); + bA.v.X = b2MulSubW( bA.v.X, c->invMassA, Px ); + bA.v.Y = b2MulSubW( bA.v.Y, c->invMassA, Py ); + bA.w = b2MulSubW( bA.w, c->invIA, b2SubW( b2MulW( rA.X, Py ), b2MulW( rA.Y, Px ) ) ); - bB.v.X = muladd( bB.v.X, c->invMassB, Px ); - bB.v.Y = muladd( bB.v.Y, c->invMassB, Py ); - bB.w = muladd( bB.w, c->invIB, sub( mul( rB.X, Py ), mul( rB.Y, Px ) ) ); + bB.v.X = b2MulAddW( bB.v.X, c->invMassB, Px ); + bB.v.Y = b2MulAddW( bB.v.Y, c->invMassB, Py ); + bB.w = b2MulAddW( bB.w, c->invIB, b2SubW( b2MulW( rB.X, Py ), b2MulW( rB.Y, Px ) ) ); } // second point non-penetration constraint { // Set effective mass to zero if restitution should not be applied - b2FloatW test1 = simde_mm256_cmp_ps( add( 
c->relativeVelocity2, threshold ), zero, SIMDE_CMP_GT_OQ ); - b2FloatW test2 = simde_mm256_cmp_ps( c->maxNormalImpulse2, zero, SIMDE_CMP_EQ_OQ ); - b2FloatW test = simde_mm256_or_ps( test1, test2 ); - - // todo slow on SSE2 - b2FloatW mass = simde_mm256_blendv_ps( c->normalMass2, zero, test ); + b2FloatW mask1 = b2GreaterThanW( b2AddW( c->relativeVelocity2, threshold ), zero ); + b2FloatW mask2 = b2EqualsW( c->maxNormalImpulse2, zero ); + b2FloatW mask = b2OrW( mask1, mask2 ); + b2FloatW mass = b2BlendW( c->normalMass2, zero, mask ); // fixed anchors for Jacobians b2Vec2W rA = c->anchorA2; b2Vec2W rB = c->anchorB2; // Relative velocity at contact - b2FloatW dvx = sub( sub( bB.v.X, mul( bB.w, rB.Y ) ), sub( bA.v.X, mul( bA.w, rA.Y ) ) ); - b2FloatW dvy = sub( add( bB.v.Y, mul( bB.w, rB.X ) ), add( bA.v.Y, mul( bA.w, rA.X ) ) ); - b2FloatW vn = add( mul( dvx, c->normal.X ), mul( dvy, c->normal.Y ) ); + b2FloatW dvx = b2SubW( b2SubW( bB.v.X, b2MulW( bB.w, rB.Y ) ), b2SubW( bA.v.X, b2MulW( bA.w, rA.Y ) ) ); + b2FloatW dvy = b2SubW( b2AddW( bB.v.Y, b2MulW( bB.w, rB.X ) ), b2AddW( bA.v.Y, b2MulW( bA.w, rA.X ) ) ); + b2FloatW vn = b2AddW( b2MulW( dvx, c->normal.X ), b2MulW( dvy, c->normal.Y ) ); // Compute normal impulse - b2FloatW negImpulse = mul( mass, add( vn, mul( c->restitution, c->relativeVelocity2 ) ) ); + b2FloatW negImpulse = b2MulW( mass, b2AddW( vn, b2MulW( c->restitution, c->relativeVelocity2 ) ) ); // Clamp the accumulated impulse - b2FloatW newImpulse = simde_mm256_max_ps( sub( c->normalImpulse2, negImpulse ), simde_mm256_setzero_ps() ); - b2FloatW impulse = sub( newImpulse, c->normalImpulse2 ); + b2FloatW newImpulse = b2MaxW( b2SubW( c->normalImpulse2, negImpulse ), b2ZeroW() ); + b2FloatW impulse = b2SubW( newImpulse, c->normalImpulse2 ); c->normalImpulse2 = newImpulse; // Apply contact impulse - b2FloatW Px = mul( impulse, c->normal.X ); - b2FloatW Py = mul( impulse, c->normal.Y ); + b2FloatW Px = b2MulW( impulse, c->normal.X ); + b2FloatW Py = b2MulW( impulse, c->normal.Y ); - bA.v.X = mulsub( bA.v.X, c->invMassA, Px ); - bA.v.Y = mulsub( bA.v.Y, c->invMassA, Py ); - bA.w = mulsub( bA.w, c->invIA, sub( mul( rA.X, Py ), mul( rA.Y, Px ) ) ); + bA.v.X = b2MulSubW( bA.v.X, c->invMassA, Px ); + bA.v.Y = b2MulSubW( bA.v.Y, c->invMassA, Py ); + bA.w = b2MulSubW( bA.w, c->invIA, b2SubW( b2MulW( rA.X, Py ), b2MulW( rA.Y, Px ) ) ); - bB.v.X = muladd( bB.v.X, c->invMassB, Px ); - bB.v.Y = muladd( bB.v.Y, c->invMassB, Py ); - bB.w = muladd( bB.w, c->invIB, sub( mul( rB.X, Py ), mul( rB.Y, Px ) ) ); + bB.v.X = b2MulAddW( bB.v.X, c->invMassB, Px ); + bB.v.Y = b2MulAddW( bB.v.Y, c->invMassB, Py ); + bB.w = b2MulAddW( bB.w, c->invIB, b2SubW( b2MulW( rB.X, Py ), b2MulW( rB.Y, Px ) ) ); } b2ScatterBodies( states, c->indexA, &bA ); @@ -1177,6 +1860,8 @@ void b2ApplyRestitutionTask( int startIndex, int endIndex, b2StepContext* contex b2TracyCZoneEnd( restitution ); } +#if B2_SIMD_WIDTH == 8 + void b2StoreImpulsesTask( int startIndex, int endIndex, b2StepContext* context ) { b2TracyCZoneNC( store_impulses, "Store", b2_colorFirebrick, true ); @@ -1291,3 +1976,78 @@ void b2StoreImpulsesTask( int startIndex, int endIndex, b2StepContext* context ) b2TracyCZoneEnd( store_impulses ); } + +#else + +void b2StoreImpulsesTask( int startIndex, int endIndex, b2StepContext* context ) +{ + b2TracyCZoneNC( store_impulses, "Store", b2_colorFirebrick, true ); + + b2ContactSim** contacts = context->contacts; + const b2ContactConstraintSIMD* constraints = context->simdContactConstraints; + + b2Manifold dummy = { 0 }; 
+ + for ( int i = startIndex; i < endIndex; ++i ) + { + const b2ContactConstraintSIMD* c = constraints + i; + const float* normalImpulse1 = (float*)&c->normalImpulse1; + const float* normalImpulse2 = (float*)&c->normalImpulse2; + const float* tangentImpulse1 = (float*)&c->tangentImpulse1; + const float* tangentImpulse2 = (float*)&c->tangentImpulse2; + const float* maxNormalImpulse1 = (float*)&c->maxNormalImpulse1; + const float* maxNormalImpulse2 = (float*)&c->maxNormalImpulse2; + const float* normalVelocity1 = (float*)&c->relativeVelocity1; + const float* normalVelocity2 = (float*)&c->relativeVelocity2; + + int base = 4 * i; + b2Manifold* m0 = contacts[base + 0] == NULL ? &dummy : &contacts[base + 0]->manifold; + b2Manifold* m1 = contacts[base + 1] == NULL ? &dummy : &contacts[base + 1]->manifold; + b2Manifold* m2 = contacts[base + 2] == NULL ? &dummy : &contacts[base + 2]->manifold; + b2Manifold* m3 = contacts[base + 3] == NULL ? &dummy : &contacts[base + 3]->manifold; + + m0->points[0].normalImpulse = normalImpulse1[0]; + m0->points[0].tangentImpulse = tangentImpulse1[0]; + m0->points[0].maxNormalImpulse = maxNormalImpulse1[0]; + m0->points[0].normalVelocity = normalVelocity1[0]; + + m0->points[1].normalImpulse = normalImpulse2[0]; + m0->points[1].tangentImpulse = tangentImpulse2[0]; + m0->points[1].maxNormalImpulse = maxNormalImpulse2[0]; + m0->points[1].normalVelocity = normalVelocity2[0]; + + m1->points[0].normalImpulse = normalImpulse1[1]; + m1->points[0].tangentImpulse = tangentImpulse1[1]; + m1->points[0].maxNormalImpulse = maxNormalImpulse1[1]; + m1->points[0].normalVelocity = normalVelocity1[1]; + + m1->points[1].normalImpulse = normalImpulse2[1]; + m1->points[1].tangentImpulse = tangentImpulse2[1]; + m1->points[1].maxNormalImpulse = maxNormalImpulse2[1]; + m1->points[1].normalVelocity = normalVelocity2[1]; + + m2->points[0].normalImpulse = normalImpulse1[2]; + m2->points[0].tangentImpulse = tangentImpulse1[2]; + m2->points[0].maxNormalImpulse = maxNormalImpulse1[2]; + m2->points[0].normalVelocity = normalVelocity1[2]; + + m2->points[1].normalImpulse = normalImpulse2[2]; + m2->points[1].tangentImpulse = tangentImpulse2[2]; + m2->points[1].maxNormalImpulse = maxNormalImpulse2[2]; + m2->points[1].normalVelocity = normalVelocity2[2]; + + m3->points[0].normalImpulse = normalImpulse1[3]; + m3->points[0].tangentImpulse = tangentImpulse1[3]; + m3->points[0].maxNormalImpulse = maxNormalImpulse1[3]; + m3->points[0].normalVelocity = normalVelocity1[3]; + + m3->points[1].normalImpulse = normalImpulse2[3]; + m3->points[1].tangentImpulse = tangentImpulse2[3]; + m3->points[1].maxNormalImpulse = maxNormalImpulse2[3]; + m3->points[1].normalVelocity = normalVelocity2[3]; + } + + b2TracyCZoneEnd( store_impulses ); +} + +#endif diff --git a/src/contact_solver.h b/src/contact_solver.h index e265e93f0..911e6a9d8 100644 --- a/src/contact_solver.h +++ b/src/contact_solver.h @@ -4,7 +4,6 @@ #pragma once #include "solver.h" -#include "x86/avx.h" typedef struct b2ContactSim b2ContactSim; @@ -34,48 +33,7 @@ typedef struct b2ContactConstraint int pointCount; } b2ContactConstraint; -// Wide float -typedef simde__m256 b2FloatW; - -// Wide vec2 -typedef struct b2Vec2W -{ - b2FloatW X, Y; -} b2Vec2W; - -// Wide rotation -typedef struct b2RotW -{ - b2FloatW S, C; -} b2RotW; - -typedef struct b2ContactConstraintSIMD -{ - int indexA[8]; - int indexB[8]; - - b2FloatW invMassA, invMassB; - b2FloatW invIA, invIB; - b2Vec2W normal; - b2FloatW friction; - b2FloatW biasRate; - b2FloatW massScale; - b2FloatW 
impulseScale; - b2Vec2W anchorA1, anchorB1; - b2FloatW normalMass1, tangentMass1; - b2FloatW baseSeparation1; - b2FloatW normalImpulse1; - b2FloatW maxNormalImpulse1; - b2FloatW tangentImpulse1; - b2Vec2W anchorA2, anchorB2; - b2FloatW baseSeparation2; - b2FloatW normalImpulse2; - b2FloatW maxNormalImpulse2; - b2FloatW tangentImpulse2; - b2FloatW normalMass2, tangentMass2; - b2FloatW restitution; - b2FloatW relativeVelocity1, relativeVelocity2; -} b2ContactConstraintSIMD; +int b2GetContactConstraintSIMDByteCount( void ); // Overflow contacts don't fit into the constraint graph coloring void b2PrepareOverflowContacts( b2StepContext* context ); diff --git a/src/core.c b/src/core.c index 24fea5b42..c18ac2047 100644 --- a/src/core.c +++ b/src/core.c @@ -38,5 +38,5 @@ void b2SetAssertFcn( b2AssertFcn* assertFcn ) b2Version b2GetVersion( void ) { - return ( b2Version ){ 3, 0, 0 }; + return ( b2Version ){ 3, 0, 1 }; } diff --git a/src/core.h b/src/core.h index fe495c6a4..4e6aa27a4 100644 --- a/src/core.h +++ b/src/core.h @@ -8,103 +8,137 @@ #define B2_NULL_INDEX ( -1 ) #ifdef NDEBUG - #define B2_DEBUG 0 +#define B2_DEBUG 0 #else - #define B2_DEBUG 1 +#define B2_DEBUG 1 #endif #if defined( BOX2D_VALIDATE ) && !defined( NDEBUG ) - #define B2_VALIDATE 1 +#define B2_VALIDATE 1 #else - #define B2_VALIDATE 0 +#define B2_VALIDATE 0 #endif // Define platform #if defined( _WIN64 ) - #define B2_PLATFORM_WINDOWS +#define B2_PLATFORM_WINDOWS #elif defined( __ANDROID__ ) - #define B2_PLATFORM_ANDROID +#define B2_PLATFORM_ANDROID #elif defined( __linux__ ) - #define B2_PLATFORM_LINUX +#define B2_PLATFORM_LINUX #elif defined( __APPLE__ ) - #include - #if defined( TARGET_OS_IPHONE ) && !TARGET_OS_IPHONE - #define B2_PLATFORM_MACOS - #else - #define B2_PLATFORM_IOS - #endif +#include +#if defined( TARGET_OS_IPHONE ) && !TARGET_OS_IPHONE +#define B2_PLATFORM_MACOS +#else +#define B2_PLATFORM_IOS +#endif #elif defined( __EMSCRIPTEN__ ) - #define B2_PLATFORM_WASM +#define B2_PLATFORM_WASM #else - #error Unsupported platform +#error Unsupported platform #endif // Define CPU #if defined( __x86_64__ ) || defined( _M_X64 ) - #define B2_CPU_X64 +#define B2_CPU_X64 #elif defined( __aarch64__ ) || defined( _M_ARM64 ) - #define B2_CPU_ARM +#define B2_CPU_ARM #elif defined( __EMSCRIPTEN__ ) - #define B2_CPU_WASM +#define B2_CPU_WASM +#else +#error Unsupported CPU +#endif + +// Define SIMD +#if defined( BOX2D_ENABLE_SIMD ) + +#if defined( B2_CPU_X64 ) + +#if defined( BOX2D_AVX2 ) +#define B2_SIMD_AVX2 +#define B2_SIMD_WIDTH 8 #else - #error Unsupported CPU +#define B2_SIMD_SSE2 +#define B2_SIMD_WIDTH 4 +#endif + +#elif defined( B2_CPU_ARM ) + +#define B2_SIMD_NEON +#define B2_SIMD_WIDTH 4 + +#elif defined( __EMSCRIPTEN__ ) + +#define B2_CPU_WASM +#define B2_SIMD_SSE2 +#define B2_SIMD_WIDTH 4 + +#else + +#define B2_SIMD_NONE +#define B2_SIMD_WIDTH 4 + +#endif + +#else + +#define B2_SIMD_NONE +#define B2_SIMD_WIDTH 4 + #endif // Define compiler #if defined( __clang__ ) - #define B2_COMPILER_CLANG +#define B2_COMPILER_CLANG #elif defined( __GNUC__ ) - #define B2_COMPILER_GCC +#define B2_COMPILER_GCC #elif defined( _MSC_VER ) - #define B2_COMPILER_MSVC +#define B2_COMPILER_MSVC #endif #if defined( B2_COMPILER_MSVC ) - #define B2_BREAKPOINT __debugbreak() +#define B2_BREAKPOINT __debugbreak() #elif defined( B2_PLATFORM_WASM ) - #define B2_BREAKPOINT \ - do \ - { \ - } \ - while ( 0 ) +#define B2_BREAKPOINT \ + do \ + { \ + } \ + while ( 0 ) #elif defined( B2_COMPILER_GCC ) || defined( B2_COMPILER_CLANG ) - #if defined( B2_CPU_X64 
) - #define B2_BREAKPOINT __asm volatile( "int $0x3" ) - #elif defined( B2_CPU_ARM ) - #define B2_BREAKPOINT __builtin_trap() - #endif +#if defined( B2_CPU_X64 ) +#define B2_BREAKPOINT __asm volatile( "int $0x3" ) +#elif defined( B2_CPU_ARM ) +#define B2_BREAKPOINT __builtin_trap() +#endif #else - #error Unknown platform +#error Unknown platform #endif #if !defined( NDEBUG ) || defined( B2_ENABLE_ASSERT ) extern b2AssertFcn* b2AssertHandler; - #define B2_ASSERT( condition ) \ - do \ - { \ - if ( !( condition ) && b2AssertHandler( #condition, __FILE__, (int)__LINE__ ) ) \ - B2_BREAKPOINT; \ - } \ - while ( 0 ) +#define B2_ASSERT( condition ) \ + do \ + { \ + if ( !( condition ) && b2AssertHandler( #condition, __FILE__, (int)__LINE__ ) ) \ + B2_BREAKPOINT; \ + } \ + while ( 0 ) #else - #define B2_ASSERT( ... ) ( (void)0 ) +#define B2_ASSERT( ... ) ( (void)0 ) #endif /// Tracy profiler instrumentation /// https://github.com/wolfpld/tracy #ifdef BOX2D_PROFILE - - #include - #define b2TracyCZoneC( ctx, color, active ) TracyCZoneC( ctx, color, active ) - #define b2TracyCZoneNC( ctx, name, color, active ) TracyCZoneNC( ctx, name, color, active ) - #define b2TracyCZoneEnd( ctx ) TracyCZoneEnd( ctx ) - +#include +#define b2TracyCZoneC( ctx, color, active ) TracyCZoneC( ctx, color, active ) +#define b2TracyCZoneNC( ctx, name, color, active ) TracyCZoneNC( ctx, name, color, active ) +#define b2TracyCZoneEnd( ctx ) TracyCZoneEnd( ctx ) #else - - #define b2TracyCZoneC( ctx, color, active ) - #define b2TracyCZoneNC( ctx, name, color, active ) - #define b2TracyCZoneEnd( ctx ) - +#define b2TracyCZoneC( ctx, color, active ) +#define b2TracyCZoneNC( ctx, name, color, active ) +#define b2TracyCZoneEnd( ctx ) #endif extern float b2_lengthUnitsPerMeter; diff --git a/src/solver.c b/src/solver.c index d83a471d6..e5e087ad7 100644 --- a/src/solver.c +++ b/src/solver.c @@ -16,13 +16,23 @@ #include "stack_allocator.h" #include "world.h" -// for mm_pause -#include "x86/sse2.h" - #include #include #include +#if defined(B2_CPU_ARM) +static inline void b2Pause (void) +{ + __asm__ __volatile__("isb\n"); +} +#else +#include +static inline void b2Pause(void) +{ + _mm_pause(); +} +#endif + typedef struct b2WorkerContext { b2StepContext* context; @@ -548,7 +558,7 @@ static void b2ExecuteMainStage( b2SolverStage* stage, b2StepContext* context, ui // todo consider using the cycle counter as well while ( atomic_load( &stage->completionCount ) != blockCount ) { - simde_mm_pause(); + b2Pause(); } atomic_store( &stage->completionCount, 0 ); @@ -753,12 +763,12 @@ void b2SolverTask( int startIndex, int endIndex, uint32_t threadIndexDontUse, vo // uint64_t prev = __rdtsc(); // do //{ - // simde_mm_pause(); + // b2Pause(); //} // while ((__rdtsc() - prev) < maxSpinTime); // maxSpinTime += 10; - simde_mm_pause(); - simde_mm_pause(); + b2Pause(); + b2Pause(); spinCount += 1; } } @@ -1104,6 +1114,12 @@ static void b2BulletBodyTask( int startIndex, int endIndex, uint32_t threadIndex b2TracyCZoneEnd( bullet_body_task ); } +#if B2_SIMD_WIDTH == 8 +#define B2_SIMD_SHIFT 3 +#else +#define B2_SIMD_SHIFT 2 +#endif + // Solve with graph coloring void b2Solve( b2World* world, b2StepContext* stepContext ) { @@ -1224,8 +1240,8 @@ void b2Solve( b2World* world, b2StepContext* stepContext ) { activeColorIndices[c] = i; - // 8-way SIMD - int colorContactCountSIMD = colorContactCount > 0 ? ( ( colorContactCount - 1 ) >> 3 ) + 1 : 0; + // 4/8-way SIMD + int colorContactCountSIMD = colorContactCount > 0 ? 
( ( colorContactCount - 1 ) >> B2_SIMD_SHIFT ) + 1 : 0; colorContactCounts[c] = colorContactCountSIMD; @@ -1279,14 +1295,15 @@ void b2Solve( b2World* world, b2StepContext* stepContext ) // Gather contact pointers for easy parallel-for traversal. Some may be NULL due to SIMD remainders. b2ContactSim** contacts = - b2AllocateStackItem( &world->stackAllocator, 8 * simdContactCount * sizeof( b2ContactSim* ), "contact pointers" ); + b2AllocateStackItem( &world->stackAllocator, B2_SIMD_WIDTH * simdContactCount * sizeof( b2ContactSim* ), "contact pointers" ); // Gather joint pointers for easy parallel-for traversal. b2JointSim** joints = b2AllocateStackItem( &world->stackAllocator, awakeJointCount * sizeof( b2JointSim* ), "joint pointers" ); - b2ContactConstraintSIMD* simdContactConstraints = b2AllocateStackItem( - &world->stackAllocator, simdContactCount * sizeof( b2ContactConstraintSIMD ), "contact constraint" ); + int simdConstraintSize = b2GetContactConstraintSIMDByteCount(); + b2ContactConstraintSIMD* simdContactConstraints = + b2AllocateStackItem( &world->stackAllocator, simdContactCount * simdConstraintSize, "contact constraint" ); int overflowContactCount = colors[b2_overflowIndex].contacts.count; b2ContactConstraint* overflowContactConstraints = b2AllocateStackItem( @@ -1311,18 +1328,18 @@ void b2Solve( b2World* world, b2StepContext* stepContext ) } else { - color->simdConstraints = simdContactConstraints + contactBase; + color->simdConstraints = (b2ContactConstraintSIMD*)((uint8_t*)simdContactConstraints + contactBase * simdConstraintSize); for ( int k = 0; k < colorContactCount; ++k ) { - contacts[8 * contactBase + k] = color->contacts.data + k; + contacts[B2_SIMD_WIDTH * contactBase + k] = color->contacts.data + k; } // remainder - int colorContactCountSIMD = ( ( colorContactCount - 1 ) >> 3 ) + 1; - for ( int k = colorContactCount; k < 8 * colorContactCountSIMD; ++k ) + int colorContactCountSIMD = ( ( colorContactCount - 1 ) >> B2_SIMD_SHIFT ) + 1; + for ( int k = colorContactCount; k < B2_SIMD_WIDTH * colorContactCountSIMD; ++k ) { - contacts[8 * contactBase + k] = NULL; + contacts[B2_SIMD_WIDTH * contactBase + k] = NULL; } contactBase += colorContactCountSIMD; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 4afdc961a..7f7acdbef 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -26,6 +26,6 @@ if(MSVC) target_compile_options(test PRIVATE /experimental:c11atomics) endif() -target_link_libraries(test PRIVATE box2d enkiTS simde) +target_link_libraries(test PRIVATE box2d enkiTS) source_group(TREE "${CMAKE_CURRENT_SOURCE_DIR}" PREFIX "" FILES ${BOX2D_TESTS})
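
Editor's note on the solver hunks above: the simde AVX intrinsics (simde_mm256_*) are replaced throughout by width-agnostic wrappers (b2FloatW, b2AddW, b2SubW, b2MulW, b2MulAddW, b2MulSubW, b2MaxW, b2MinW, b2BlendW, b2SplatW, b2ZeroW) whose definitions are not part of this diff. The following is only a minimal sketch of how such wrappers could map onto the backends selected in core.h; apart from the wrapper names and the B2_SIMD_AVX2 / B2_SIMD_NONE macros taken from the diff, everything in it is an assumption, not the library's actual implementation.

#if defined( B2_SIMD_AVX2 )

#include <immintrin.h>

typedef __m256 b2FloatW; // 8 lanes

static inline b2FloatW b2ZeroW( void ) { return _mm256_setzero_ps(); }
static inline b2FloatW b2SplatW( float s ) { return _mm256_set1_ps( s ); }
static inline b2FloatW b2AddW( b2FloatW a, b2FloatW b ) { return _mm256_add_ps( a, b ); }
static inline b2FloatW b2SubW( b2FloatW a, b2FloatW b ) { return _mm256_sub_ps( a, b ); }
static inline b2FloatW b2MulW( b2FloatW a, b2FloatW b ) { return _mm256_mul_ps( a, b ); }
static inline b2FloatW b2MaxW( b2FloatW a, b2FloatW b ) { return _mm256_max_ps( a, b ); }
static inline b2FloatW b2MinW( b2FloatW a, b2FloatW b ) { return _mm256_min_ps( a, b ); }

// a + b * c and a - b * c, matching how the solver applies impulses to velocities
static inline b2FloatW b2MulAddW( b2FloatW a, b2FloatW b, b2FloatW c ) { return _mm256_add_ps( a, _mm256_mul_ps( b, c ) ); }
static inline b2FloatW b2MulSubW( b2FloatW a, b2FloatW b, b2FloatW c ) { return _mm256_sub_ps( a, _mm256_mul_ps( b, c ) ); }

// lane-wise select: where the mask is set take b, otherwise a
static inline b2FloatW b2BlendW( b2FloatW a, b2FloatW b, b2FloatW mask ) { return _mm256_blendv_ps( a, b, mask ); }

#elif defined( B2_SIMD_NONE )

// 4-wide scalar fallback so the solver loops compile unchanged
typedef struct b2FloatW { float x, y, z, w; } b2FloatW;

static inline b2FloatW b2AddW( b2FloatW a, b2FloatW b )
{
    return ( b2FloatW ){ a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w };
}
// ... the remaining wrappers follow the same per-lane pattern

#endif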
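
Two patterns in the rewritten solver are easier to read per lane than in wide form: the friction clamp keeps the accumulated tangent impulse inside the friction cone, and the restitution pass zeroes the effective mass for lanes where restitution should not apply. The snippet below is only a scalar restatement of the wide expressions shown in the hunks above; the helper names ClampFriction and RestitutionMass are illustrative, not from the diff.

#include <math.h>
#include <stdbool.h>

// Per-lane meaning of:
// newImpulse = b2MaxW( b2SubW( b2ZeroW(), maxFriction ), b2MinW( newImpulse, maxFriction ) )
static float ClampFriction( float tangentImpulse, float negImpulse, float friction, float normalImpulse )
{
    float maxFriction = friction * normalImpulse;
    float newImpulse = tangentImpulse - negImpulse;
    return fmaxf( -maxFriction, fminf( newImpulse, maxFriction ) );
}

// Per-lane meaning of:
// mask = b2OrW( b2GreaterThanW( relativeVelocity + threshold, zero ), b2EqualsW( maxNormalImpulse, zero ) )
// mass = b2BlendW( normalMass, zero, mask )
static float RestitutionMass( float normalMass, float relativeVelocity, float threshold, float maxNormalImpulse )
{
    bool skip = ( relativeVelocity + threshold > 0.0f ) || ( maxNormalImpulse == 0.0f );
    return skip ? 0.0f : normalMass;
}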
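
The scheduling changes in solver.c stop hard-coding 8 lanes: contact counts per color are rounded up to whole SIMD groups with B2_SIMD_SHIFT, remainder lanes get NULL contact pointers, and constraint memory is indexed in bytes via b2GetContactConstraintSIMDByteCount() because callers no longer see sizeof(b2ContactConstraintSIMD). A small worked example follows; the counts and the byte size are chosen arbitrarily for illustration and do not come from the diff.

#include <stdio.h>

int main( void )
{
    // Assume a 4-wide build (SSE2/Neon/scalar): B2_SIMD_WIDTH == 4, B2_SIMD_SHIFT == 2.
    int width = 4;
    int shift = 2;

    // Same rounding b2Solve uses per color: ceil(colorContactCount / width).
    int colorContactCount = 10; // arbitrary example count
    int colorContactCountSIMD = colorContactCount > 0 ? ( ( colorContactCount - 1 ) >> shift ) + 1 : 0;

    // 10 contacts -> 3 wide constraints; the last 2 lanes get NULL contact pointers,
    // so the remainder solves as a no-op.
    printf( "%d wide constraints, %d padded lanes\n", colorContactCountSIMD,
            width * colorContactCountSIMD - colorContactCount );

    // Constraint storage is addressed in bytes. The 960 below is a made-up stand-in
    // for the value returned by b2GetContactConstraintSIMDByteCount().
    int simdConstraintSize = 960;
    int contactBase = 3;
    printf( "color constraints start at byte offset %d\n", contactBase * simdConstraintSize );
    return 0;
}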