From e92cd8667231f25d88d233ceca04a64b063e33f3 Mon Sep 17 00:00:00 2001 From: anutosh491 Date: Mon, 9 Oct 2023 10:56:14 +0530 Subject: [PATCH 01/10] Initial Implementation for the new WASM based instruction set --- include/xsimd/arch/xsimd_isa.hpp | 4 + include/xsimd/arch/xsimd_wasm.hpp | 541 ++++++++++++++++++++ include/xsimd/config/xsimd_arch.hpp | 3 +- include/xsimd/config/xsimd_config.hpp | 14 +- include/xsimd/types/xsimd_all_registers.hpp | 2 + include/xsimd/types/xsimd_wasm_register.hpp | 61 +++ test/test_wasm/test_wasm.cpp | 31 ++ 7 files changed, 653 insertions(+), 3 deletions(-) create mode 100644 include/xsimd/arch/xsimd_wasm.hpp create mode 100644 include/xsimd/types/xsimd_wasm_register.hpp create mode 100644 test/test_wasm/test_wasm.cpp diff --git a/include/xsimd/arch/xsimd_isa.hpp b/include/xsimd/arch/xsimd_isa.hpp index cf0f796a1..8f05a5dab 100644 --- a/include/xsimd/arch/xsimd_isa.hpp +++ b/include/xsimd/arch/xsimd_isa.hpp @@ -80,6 +80,10 @@ #include "./xsimd_sve.hpp" #endif +#if XSIMD_WITH_WASM +#include "./xsimd_wasm.hpp" +#endif + // Must come last to have access to all conversion specializations. #include "./xsimd_generic.hpp" diff --git a/include/xsimd/arch/xsimd_wasm.hpp b/include/xsimd/arch/xsimd_wasm.hpp new file mode 100644 index 000000000..63d77ab19 --- /dev/null +++ b/include/xsimd/arch/xsimd_wasm.hpp @@ -0,0 +1,541 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * Copyright (c) Anutosh Bhat * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#ifndef XSIMD_WASM_HPP +#define XSIMD_WASM_HPP + +#include + +#include "../types/xsimd_wasm_register.hpp" + +namespace xsimd +{ + + namespace kernel + { + using namespace types; + + // abs + template ::value && std::is_signed::value, void>::type> + inline batch abs(batch const& self, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_i8x16_abs(self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_i16x8_abs(self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_i32x4_abs(self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return wasm_i64x2_abs(self); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + + template + inline batch abs(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f32x4_abs(self, other); + } + + template + inline batch abs(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f64x2_abs(self, other); + } + + // add + template ::value, void>::type> + inline batch add(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_i8x16_add(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_i16x8_add(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_i32x4_add(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return wasm_i64x2_add(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + + template + inline batch add(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f32x4_add(self, other); + } + + template + inline batch add(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f64x2_add(self, other); + } + + // all + template + inline bool all(batch_bool const& self, requires_arch) noexcept + { + return wasm_i32x4_bitmask(self) == 0x0F; + } + template + inline bool all(batch_bool const& self, requires_arch) noexcept + { + return wasm_i64x2_bitmask(self) == 0x03; + } + template ::value, void>::type> + inline bool all(batch_bool const& self, requires_arch) noexcept + { + return wasm_i8x16_bitmask(self) == 0xFFFF; + } + + // any + template + inline bool any(batch_bool const& self, requires_arch) noexcept + { + return wasm_i32x4_bitmask(self) != 0; + } + template + inline bool any(batch_bool const& self, requires_arch) noexcept + { + return wasm_i64x2_bitmask(self) != 0; + } + template ::value, void>::type> + inline bool any(batch_bool const& self, requires_arch) noexcept + { + return wasm_i8x16_bitmask(self) != 0; + } + + // bitwise_and + template + inline batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_v128_and(self, other); + } + + template + inline batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return wasm_v128_and(self, other); + } + + // bitwise_andnot + template + inline batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_v128_andnot(self, other); + } + + template + inline batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return wasm_v128_andnot(self, other); + } + + // bitwise_or + template + inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + 
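+            // WASM SIMD exposes a single untyped v128_t register for every lane type,
+            // so the bitwise helpers can forward to the type-agnostic wasm_v128_* intrinsics.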
return wasm_v128_or(self, other); + } + + template + inline batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return wasm_v128_or(self, other); + } + + // bitwise_lshift + template ::value, void>::type> + inline batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_v128_and(wasm_i8x16_splat(0xFF << other), wasm_i32x4_shl(self, other)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_i16x8_shl(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_i32x4_shl(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return wasm_i64x2_shl(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + + // bitwise_xor + template + inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_v128_xor(self, other); + } + + template + inline batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return wasm_v128_xor(self, other); + } + + // broadcast + template + batch inline broadcast(float val, requires_arch) noexcept + { + return wasm_f32x4_splat(val); + } + template ::value, void>::type> + inline batch broadcast(T val, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_i8x16_splat(val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_i16x8_splat(val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_i32x4_splat(val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return wasm_i64x2_splat(val); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + inline batch broadcast(double val, requires_arch) noexcept + { + return wasm_f64x2_splat(val); + } + + // div + template + inline batch div(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f32x4_div(self, other); + } + template + inline batch div(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f64x2_div(self, other); + } + + // ge + template + inline batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f32x4_ge(self, other); + } + template + inline batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f64x2_ge(self, other); + } + + // le + template + inline batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f32x4_le(self, other); + } + template + inline batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f64x2_le(self, other); + } + + // load_aligned + template + inline batch load_aligned(float const* mem, convert, requires_arch) noexcept + { + // Assuming that mem is aligned properly, you can use wasm_v128_load to load the mem. + return wasm_v128_load(mem); + } + template ::value, void>::type> + inline batch load_aligned(T const* mem, convert, requires_arch) noexcept + { + // Assuming that mem is aligned properly, you can use wasm_v128_load to load the mem. + return wasm_v128_load((v128_t const*)mem); + } + template + inline batch load_aligned(double const* mem, convert, requires_arch) noexcept + { + // Assuming that mem is aligned properly, you can use wasm_v128_load to load the mem. 
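+            // Note: wasm_v128_load itself tolerates unaligned pointers (alignment is only a
+            // performance hint in WebAssembly), so the aligned and unaligned entry points
+            // can share the same intrinsic.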
+ return wasm_v128_load(mem); + } + + // load_unaligned + template + inline batch load_unaligned(float const* mem, convert, requires_arch) noexcept + { + return wasm_v128_load(mem); + } + template ::value, void>::type> + inline batch load_unaligned(T const* mem, convert, requires_arch) noexcept + { + return wasm_v128_load((v128_t const*)mem); + } + template + inline batch load_unaligned(double const* mem, convert, requires_arch) noexcept + { + return wasm_v128_load(mem); + } + + // max + template + inline batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f32x4_pmax(self, other); + } + template ::value, void>::type> + inline batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return select(self > other, self, other); + } + template + inline batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f64x2_pmax(self, other); + } + + // min + template + inline batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f32x4_pmin(self, other); + } + template ::value, void>::type> + inline batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return select(self <= other, self, other); + } + template + inline batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f64x2_pmin(self, other); + } + + // mul + template + inline batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f32x4_mul(self, other); + } + template + inline batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f64x2_mul(self, other); + } + + // select + template + inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(cond, false_br)); + } + + template ::value, void>::type> + inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(cond, false_br)); + } + template ::value, void>::type> + inline batch select(batch_bool_constant, Values...> const&, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return select(batch_bool { Values... }, true_br, false_br, wasm {}); + } + template + inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(cond, false_br)); + } + + // set + template + inline batch set(batch const&, requires_arch, Values... 
values) noexcept + { + static_assert(sizeof...(Values) == batch::size, "consistent init"); + return wasm_f32x4_make(values...); + } + + template ::value, void>::type> + inline batch set(batch const&, requires_arch, T v0, T v1) noexcept + { + return wasm_i64x2_make(v0, v1); + } + + template ::value, void>::type> + inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3) noexcept + { + return wasm_i32x4_make(v0, v1, v2, v3); + } + + template ::value, void>::type> + inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept + { + return wasm_i16x8_make(v0, v1, v2, v3, v4, v5, v6, v7); + } + + template ::value, void>::type> + inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept + { + return wasm_i8x16_make(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); + } + + template + inline batch set(batch const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch::size, "consistent init"); + return wasm_f64x2_make(values...); + } + + template ::value, void>::type> + inline batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + { + return set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data; + } + + // store_aligned + template + inline void store_aligned(float* mem, batch const& self, requires_arch) noexcept + { + // Assuming that mem is aligned properly, you can use wasm_v128_store to store the batch. + return wasm_v128_store(mem, self); + } + template ::value, void>::type> + inline void store_aligned(T* mem, batch const& self, requires_arch) noexcept + { + // Assuming that mem is aligned properly, you can use wasm_v128_store to store the batch. + return wasm_v128_store((v128_t*)mem, self); + } + template ::value, void>::type> + inline void store_aligned(T* mem, batch_bool const& self, requires_arch) noexcept + { + // Assuming that mem is aligned properly, you can use wasm_v128_store to store the batch. + return wasm_v128_store((v128_t*)mem, self); + } + template + inline void store_aligned(double* mem, batch const& self, requires_arch) noexcept + { + // Assuming that mem is aligned properly, you can use wasm_v128_store to store the batch. 
+ return wasm_v128_store(mem, self); + } + + // store_unaligned + template + inline void store_unaligned(float* mem, batch const& self, requires_arch) noexcept + { + return wasm_v128_store(mem, self); + } + template ::value, void>::type> + inline void store_unaligned(T* mem, batch const& self, requires_arch) noexcept + { + return wasm_v128_store((v128_t*)mem, self); + } + template ::value, void>::type> + inline void store_unaligned(T* mem, batch_bool const& self, requires_arch) noexcept + { + return wasm_v128_store((v128_t*)mem, self); + } + template + inline void store_unaligned(double* mem, batch const& self, requires_arch) noexcept + { + return wasm_v128_store(mem, self); + } + + // sub + template + inline batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f32x4_sub(self, other); + } + template ::value, void>::type> + inline batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_i8x16_sub(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_i16x8_sub(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_i32x4_sub(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return wasm_i64x2_sub(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + inline batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f64x2_sub(self, other); + } + + // sqrt + template + inline batch sqrt(batch const& val, requires_arch) noexcept + { + return wasm_f32x4_sqrt(val); + } + template + inline batch sqrt(batch const& val, requires_arch) noexcept + { + return wasm_f64x2_sqrt(val); + } + } +} + +#endif \ No newline at end of file diff --git a/include/xsimd/config/xsimd_arch.hpp b/include/xsimd/config/xsimd_arch.hpp index 81cada583..ab9ecbc29 100644 --- a/include/xsimd/config/xsimd_arch.hpp +++ b/include/xsimd/config/xsimd_arch.hpp @@ -193,7 +193,8 @@ namespace xsimd using all_x86_architectures = arch_list, avx2, fma3, avx, fma4, fma3, sse4_2, sse4_1, /*sse4a,*/ ssse3, sse3, sse2>; using all_sve_architectures = arch_list, detail::sve<256>, detail::sve<128>>; using all_arm_architectures = typename detail::join>::type; - using all_architectures = typename detail::join::type; + using all_wasm_architectures = arch_list; + using all_architectures = typename detail::join::type; using supported_architectures = typename detail::supported::type; diff --git a/include/xsimd/config/xsimd_config.hpp b/include/xsimd/config/xsimd_config.hpp index d36ed1ee5..27824f3ae 100644 --- a/include/xsimd/config/xsimd_config.hpp +++ b/include/xsimd/config/xsimd_config.hpp @@ -285,6 +285,17 @@ #define XSIMD_SVE_BITS 0 #endif +/** + * @ingroup xsimd_config_macro + * + * Set to 1 if WebAssembly SIMD is available at compile-time, to 0 otherwise. 
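+ * Detection is keyed on the __EMSCRIPTEN__ macro below; the wasm_simd128.h
+ * intrinsics additionally require compiling with Emscripten's -msimd128 flag.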
+ */ +#ifdef __EMSCRIPTEN__ +#define XSIMD_WITH_WASM 1 +#else +#define XSIMD_WITH_WASM 0 +#endif + // Workaround for MSVC compiler #ifdef _MSC_VER @@ -343,8 +354,7 @@ #endif -#if !XSIMD_WITH_SSE2 && !XSIMD_WITH_SSE3 && !XSIMD_WITH_SSSE3 && !XSIMD_WITH_SSE4_1 && !XSIMD_WITH_SSE4_2 && !XSIMD_WITH_AVX && !XSIMD_WITH_AVX2 && !XSIMD_WITH_FMA3_SSE && !XSIMD_WITH_FMA4 && !XSIMD_WITH_FMA3_AVX && !XSIMD_WITH_FMA3_AVX2 && !XSIMD_WITH_AVX512F && !XSIMD_WITH_AVX512CD && !XSIMD_WITH_AVX512DQ && !XSIMD_WITH_AVX512BW && !XSIMD_WITH_NEON && !XSIMD_WITH_NEON64 && !XSIMD_WITH_SVE -#define XSIMD_NO_SUPPORTED_ARCHITECTURE +#if !XSIMD_WITH_SSE2 && !XSIMD_WITH_SSE3 && !XSIMD_WITH_SSSE3 && !XSIMD_WITH_SSE4_1 && !XSIMD_WITH_SSE4_2 && !XSIMD_WITH_AVX && !XSIMD_WITH_AVX2 && !XSIMD_WITH_FMA3_SSE && !XSIMD_WITH_FMA4 && !XSIMD_WITH_FMA3_AVX && !XSIMD_WITH_FMA3_AVX2 && !XSIMD_WITH_AVX512F && !XSIMD_WITH_AVX512CD && !XSIMD_WITH_AVX512DQ && !XSIMD_WITH_AVX512BW && !XSIMD_WITH_NEON && !XSIMD_WITH_NEON64 && !XSIMD_WITH_SVE && !XSIMD_WITH_WASM #endif #endif diff --git a/include/xsimd/types/xsimd_all_registers.hpp b/include/xsimd/types/xsimd_all_registers.hpp index 1fe077325..ec20ce5fb 100644 --- a/include/xsimd/types/xsimd_all_registers.hpp +++ b/include/xsimd/types/xsimd_all_registers.hpp @@ -30,3 +30,5 @@ #include "xsimd_neon_register.hpp" #include "xsimd_sve_register.hpp" + +#include "xsimd_wasm_register.hpp" diff --git a/include/xsimd/types/xsimd_wasm_register.hpp b/include/xsimd/types/xsimd_wasm_register.hpp new file mode 100644 index 000000000..ab8782ac6 --- /dev/null +++ b/include/xsimd/types/xsimd_wasm_register.hpp @@ -0,0 +1,61 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * Copyright (c) Anutosh Bhat * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#ifndef XSIMD_WASM_REGISTER_HPP +#define XSIMD_WASM_REGISTER_HPP + +#include "xsimd_generic_arch.hpp" +#include "xsimd_register.hpp" + +#if XSIMD_WITH_WASM +#include +#endif + +namespace xsimd +{ + /** + * @ingroup architectures + * + * WASM instructions + */ + struct wasm : generic + { + static constexpr bool supported() noexcept { return XSIMD_WITH_WASM; } + static constexpr bool available() noexcept { return true; } + static constexpr bool requires_alignment() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(10, 0, 0); } + static constexpr std::size_t alignment() noexcept { return 16; } + static constexpr char const* name() noexcept { return "wasm"; } + }; + +#if XSIMD_WITH_WASM + namespace types + { + XSIMD_DECLARE_SIMD_REGISTER(bool, wasm, v128_t); + XSIMD_DECLARE_SIMD_REGISTER(signed char, wasm, v128_t); + XSIMD_DECLARE_SIMD_REGISTER(unsigned char, wasm, v128_t); + XSIMD_DECLARE_SIMD_REGISTER(char, wasm, v128_t); + XSIMD_DECLARE_SIMD_REGISTER(unsigned short, wasm, v128_t); + XSIMD_DECLARE_SIMD_REGISTER(short, wasm, v128_t); + XSIMD_DECLARE_SIMD_REGISTER(unsigned int, wasm, v128_t); + XSIMD_DECLARE_SIMD_REGISTER(int, wasm, v128_t); + XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, wasm, v128_t); + XSIMD_DECLARE_SIMD_REGISTER(long int, wasm, v128_t); + XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, wasm, v128_t); + XSIMD_DECLARE_SIMD_REGISTER(long long int, wasm, v128_t); + XSIMD_DECLARE_SIMD_REGISTER(float, wasm, v128_t); + XSIMD_DECLARE_SIMD_REGISTER(double, wasm, v128_t); + } +#endif +} + +#endif diff --git a/test/test_wasm/test_wasm.cpp b/test/test_wasm/test_wasm.cpp new file mode 100644 index 000000000..65ee6f89c --- /dev/null +++ b/test/test_wasm/test_wasm.cpp @@ -0,0 +1,31 @@ +#include "xsimd/xsimd.hpp" +#include +#include + +#include +#include // for reporting errors + +using namespace emscripten; + +int test_abs() +{ + std::cout << "test_abs" << std::endl; + auto ans = xsimd::abs(a); + std::cout << ans << std::endl; + return 0; +} + +int run_tests() +{ + // todo add actual tests + if (auto ret = test_(); ret != 0) + { + return ret; + } + return 0; +} + +EMSCRIPTEN_BINDINGS(my_module) +{ + emscripten::function("run_tests", &run_tests); +} \ No newline at end of file From 6352536f8944e49ac6654431f467d368d2469e06 Mon Sep 17 00:00:00 2001 From: anutosh491 Date: Mon, 9 Oct 2023 12:57:46 +0530 Subject: [PATCH 02/10] Added the following operations through direct intrinsics 1) Bitwise: bitwise_rshift, bitwise_not, bitwise_and, bitwise_or, bitwise_lshift, bitwise_xor, bitwise_andnot 2) Logical: gt, lt, eq, neq, all, any, ge, le 3) Arithmetic: add, sub, mul, div, neg, reciprocal 4) Math: abs, sqrt, rsqrt, max, min 5) Roudning: floor, ceil , trunc 6) Memory: store_aligned, store_unaligned, load_aligned, load_unaligned, set 7) Complex: isnan 7) Misc: mask, select, broadcast, insert --- include/xsimd/arch/xsimd_wasm.hpp | 539 +++++++++++++++++++++++++++++- 1 file changed, 532 insertions(+), 7 deletions(-) diff --git a/include/xsimd/arch/xsimd_wasm.hpp b/include/xsimd/arch/xsimd_wasm.hpp index 63d77ab19..a6f45ee10 100644 --- a/include/xsimd/arch/xsimd_wasm.hpp +++ b/include/xsimd/arch/xsimd_wasm.hpp @@ -52,15 +52,15 @@ namespace xsimd } template - inline batch abs(batch const& self, batch const& other, requires_arch) noexcept + inline batch abs(batch const& self, requires_arch) noexcept { - return wasm_f32x4_abs(self, other); + return 
wasm_f32x4_abs(self); } template - inline batch abs(batch const& self, batch const& other, requires_arch) noexcept + inline batch abs(batch const& self, requires_arch) noexcept { - return wasm_f64x2_abs(self, other); + return wasm_f64x2_abs(self); } // add @@ -202,6 +202,73 @@ namespace xsimd } } + // bitwise_rshift + template ::value, void>::type> + inline batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_i8x16_shr(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_i16x8_shr(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_i32x4_shr(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return wasm_i64x2_shr(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_v128_and(wasm_i8x16_splat(0xFF >> other), wasm_u32x4_shr(self, other)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_u16x8_shr(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_u32x4_shr(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return wasm_u64x2_shr(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + } + + // bitwise_not + template + inline batch bitwise_not(batch const& self, requires_arch) noexcept + { + return wasm_v128_not(self); + } + + template + inline batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + { + return wasm_v128_not(self); + } + // bitwise_xor template inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept @@ -252,6 +319,18 @@ namespace xsimd return wasm_f64x2_splat(val); } + // ceil + template + inline batch ceil(batch const& self, requires_arch) noexcept + { + return wasm_f32x4_ceil(self); + } + template + inline batch ceil(batch const& self, requires_arch) noexcept + { + return wasm_f64x2_ceil(self); + } + // div template inline batch div(batch const& self, batch const& other, requires_arch) noexcept @@ -264,6 +343,91 @@ namespace xsimd return wasm_f64x2_div(self, other); } + // eq + template + inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f32x4_eq(self, other); + } + template + inline batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return wasm_f32x4_eq(self, other); + } + template ::value, void>::type> + inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_i8x16_eq(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_i16x8_eq(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_i32x4_eq(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return wasm_i64x2_eq(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template ::value, void>::type> + inline batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_i8x16_eq(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_i16x8_eq(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_i32x4_eq(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + 
{ + return wasm_i64x2_eq(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f64x2_eq(self, other); + } + template + inline batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return wasm_f64x2_eq(self, other); + } + + // floor + template + inline batch floor(batch const& self, requires_arch) noexcept + { + return wasm_f32x4_floor(self); + } + + template + inline batch floor(batch const& self, requires_arch) noexcept + { + return wasm_f64x2_floor(self); + } + // ge template inline batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept @@ -276,6 +440,138 @@ namespace xsimd return wasm_f64x2_ge(self, other); } + // gt + template + inline batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f32x4_gt(self, other); + } + template ::value, void>::type> + inline batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_i8x16_gt(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_i16x8_gt(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_i32x4_gt(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return wasm_i64x2_gt(self, other); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_u8x16_gt(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_u16x8_gt(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_u32x4_gt(self, other); + } + else + { + return gt(self, other, generic {}); + } + } + } + + template + inline batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f64x2_gt(self, other); + } + + // insert + template + inline batch insert(batch const& self, float val, index pos, requires_arch) noexcept + { + return wasm_f32x4_replace_lane(self, pos, val); + } + template ::value, void>::type> + inline batch insert(batch const& self, T val, index pos, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_i8x16_replace_lane(self, pos, val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_i16x8_replace_lane(self, pos, val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_i32x4_replace_lane(self, pos, val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return wasm_i64x2_replace_lane(self, pos, val); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_u8x16_replace_lane(self, pos, val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_u16x8_replace_lane(self, pos, val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_u32x4_replace_lane(self, pos, val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return wasm_u64x2_replace_lane(self, pos, val); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + } + + template + inline batch insert(batch const& self, double val, index pos, requires_arch) noexcept + { + return wasm_f64x2_replace_lane(self, pos, val); + } + + // isnan + template + inline batch_bool isnan(batch const& self, requires_arch) noexcept + { + return 
wasm_v128_or(wasm_f32x4_ne(self, self), wasm_f32x4_ne(self, self)); + } + template + inline batch_bool isnan(batch const& self, requires_arch) noexcept + { + return wasm_v128_or(wasm_f64x2_ne(self, self), wasm_f64x2_ne(self, self)); + } + // le template inline batch_bool le(batch const& self, batch const& other, requires_arch) noexcept @@ -325,6 +621,109 @@ namespace xsimd return wasm_v128_load(mem); } + // lt + template + inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f32x4_lt(self, other); + } + template ::value, void>::type> + inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_i8x16_lt(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_i16x8_lt(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_i32x4_lt(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return wasm_i64x2_lt(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_u8x16_lt(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_u16x8_lt(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_u32x4_lt(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return lt(self, other, generic {}); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + } + + template + inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f64x2_lt(self, other); + } + + // mask + template ::value, void>::type> + inline uint64_t mask(batch_bool const& self, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_i8x16_bitmask(self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_i16x8_bitmask(self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_i32x4_bitmask(self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return wasm_i64x2_bitmask(self); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + inline uint64_t mask(batch_bool const& self, requires_arch) noexcept + { + return wasm_i32x4_bitmask(self); + } + + template + inline uint64_t mask(batch_bool const& self, requires_arch) noexcept + { + return wasm_i64x2_bitmask(self); + } + // max template inline batch max(batch const& self, batch const& other, requires_arch) noexcept @@ -371,17 +770,117 @@ namespace xsimd return wasm_f64x2_mul(self, other); } + // neg + template ::value, void>::type> + inline batch neg(batch const& self, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_i8x16_neg(self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_i16x8_neg(self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_i32x4_neg(self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return wasm_i64x2_neg(self); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + + template + inline batch neg(batch const& self, requires_arch) noexcept + { + return wasm_f32x4_neg(self); + } + + template + inline batch neg(batch const& self, requires_arch) noexcept + { + return wasm_f64x2_neg(self); + } + + // neq + template + inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + 
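+            // f32x4.ne is an IEEE "unordered or not equal" comparison, so a NaN lane
+            // compares not-equal to itself, which is exactly what the isnan emulation
+            // above relies on.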
return wasm_f32x4_ne(self, other); + } + template ::value, void>::type> + inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return ~(self == other); + } + template + inline batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return wasm_f32x4_ne(self, other); + } + template ::value, void>::type> + inline batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return ~(self == other); + } + + template + inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f64x2_ne(self, other); + } + template + inline batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return wasm_f64x2_ne(self, other); + } + + // reciprocal + template + inline batch reciprocal(batch const& self, requires_arch) noexcept + { + v128_t one = wasm_f32x4_splat(1.0f); + return wasm_f32x4_div(one, self); + } + template + inline batch reciprocal(batch const& self, requires_arch) noexcept + { + v128_t one = wasm_f64x2_splat(1.0); + return wasm_f64x2_div(one, self); + } + + // rsqrt + template + inline batch rsqrt(batch const& self, requires_arch) noexcept + { + v128_t one = wasm_f32x4_splat(1.0f); + return wasm_f32x4_div(one, wasm_f32x4_sqrt(self)); + } + template + inline batch rsqrt(batch const& self, requires_arch) noexcept + { + v128_t one = wasm_f64x2_splat(1.0); + return wasm_f64x2_div(one, wasm_f64x2_sqrt(self)); + } + // select template inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { - return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(cond, false_br)); + return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(false_br, cond)); } template ::value, void>::type> inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { - return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(cond, false_br)); + return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(false_br, cond)); } template ::value, void>::type> inline batch select(batch_bool_constant, Values...> const&, batch const& true_br, batch const& false_br, requires_arch) noexcept @@ -391,7 +890,7 @@ namespace xsimd template inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { - return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(cond, false_br)); + return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(false_br, cond)); } // set @@ -439,6 +938,20 @@ namespace xsimd return set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data; } + template + inline batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); + return set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data; + } + + template + inline batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); + return set(batch(), A {}, static_cast(values ? 
-1LL : 0LL)...).data; + } + // store_aligned template inline void store_aligned(float* mem, batch const& self, requires_arch) noexcept @@ -535,6 +1048,18 @@ namespace xsimd { return wasm_f64x2_sqrt(val); } + + // trunc + template + inline batch trunc(batch const& self, requires_arch) noexcept + { + return wasm_f32x4_trunc(self); + } + template + inline batch trunc(batch const& self, requires_arch) noexcept + { + return wasm_f64x2_trunc(self); + } } } From 1402e0e0947a2b60f7a926c428c35ed4abf29f43 Mon Sep 17 00:00:00 2001 From: anutosh491 Date: Wed, 18 Oct 2023 16:03:05 +0530 Subject: [PATCH 03/10] Added the following operation through emulations 1) arithmetic: sadd, ssub, hadd, haddp 2) batch manipulation: zip_lo, zip_hi, slide_left, slide_right 3) math: reduce_add 4) memory: store_complex, load_complex 5) misc: from_mask --- .../arch/generic/xsimd_generic_arithmetic.hpp | 44 ++ include/xsimd/arch/xsimd_generic_fwd.hpp | 6 + include/xsimd/arch/xsimd_wasm.hpp | 400 +++++++++++++++++- test/test_wasm/test_wasm.cpp | 31 -- 4 files changed, 444 insertions(+), 37 deletions(-) delete mode 100644 test/test_wasm/test_wasm.cpp diff --git a/include/xsimd/arch/generic/xsimd_generic_arithmetic.hpp b/include/xsimd/arch/generic/xsimd_generic_arithmetic.hpp index a14a5a2a3..c72e416c6 100644 --- a/include/xsimd/arch/generic/xsimd_generic_arithmetic.hpp +++ b/include/xsimd/arch/generic/xsimd_generic_arithmetic.hpp @@ -127,6 +127,20 @@ namespace xsimd return { res_r, res_i }; } + // hadd + template ::value, void>::type*/> + inline T hadd(batch const& self, requires_arch) noexcept + { + alignas(A::alignment()) T buffer[batch::size]; + self.store_aligned(buffer); + T res = 0; + for (T val : buffer) + { + res += val; + } + return res; + } + // incr template inline batch incr(batch const& self, requires_arch) noexcept @@ -172,6 +186,23 @@ namespace xsimd { return add(self, other); // no saturated arithmetic on floating point numbers } + template ::value, void>::type*/> + inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + auto mask = (other >> (8 * sizeof(T) - 1)); + auto self_pos_branch = min(std::numeric_limits::max() - other, self); + auto self_neg_branch = max(std::numeric_limits::min() - other, self); + return other + select(batch_bool(mask.data), self_neg_branch, self_pos_branch); + } + else + { + const auto diffmax = std::numeric_limits::max() - self; + const auto mindiff = min(diffmax, other); + return self + mindiff; + } + } template inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept { @@ -184,6 +215,19 @@ namespace xsimd { return sub(self, other); // no saturated arithmetic on floating point numbers } + template ::value, void>::type*/> + inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + return sadd(self, -other); + } + else + { + const auto diff = min(self, other); + return self - diff; + } + } template inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept { diff --git a/include/xsimd/arch/xsimd_generic_fwd.hpp b/include/xsimd/arch/xsimd_generic_fwd.hpp index 86e398a5e..87dcaa886 100644 --- a/include/xsimd/arch/xsimd_generic_fwd.hpp +++ b/include/xsimd/arch/xsimd_generic_fwd.hpp @@ -31,6 +31,12 @@ namespace xsimd inline batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept; template ::value, void>::type> inline batch mul(batch const& self, batch const& other, requires_arch) noexcept; + 
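+    // Forward declarations of the generic saturated-arithmetic and horizontal-add
+    // helpers, so that architecture-specific kernels included ahead of
+    // xsimd_generic.hpp can dispatch to them with a generic{} tag.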
template ::value, void>::type> + inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept; + template ::value, void>::type> + inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept; + template ::value, void>::type> + inline T hadd(batch const& self, requires_arch) noexcept; } } diff --git a/include/xsimd/arch/xsimd_wasm.hpp b/include/xsimd/arch/xsimd_wasm.hpp index a6f45ee10..f161120c6 100644 --- a/include/xsimd/arch/xsimd_wasm.hpp +++ b/include/xsimd/arch/xsimd_wasm.hpp @@ -181,7 +181,7 @@ namespace xsimd { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { - return wasm_v128_and(wasm_i8x16_splat(0xFF << other), wasm_i32x4_shl(self, other)); + return wasm_i8x16_shl(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { @@ -234,7 +234,7 @@ namespace xsimd { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { - return wasm_v128_and(wasm_i8x16_splat(0xFF >> other), wasm_u32x4_shr(self, other)); + return wasm_u8x16_shr(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { @@ -428,6 +428,102 @@ namespace xsimd return wasm_f64x2_floor(self); } + // from_mask + template + inline batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept + { + alignas(A::alignment()) static const uint32_t lut[][4] = { + { 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, + { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 }, + { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 }, + { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 }, + { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 }, + { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }, + { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF }, + { 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }, + { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, + }; + assert(!(mask & ~0xFul) && "inbound mask"); + return wasm_v128_load((const v128_t*)lut[mask]); + } + template + inline batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept + { + alignas(A::alignment()) static const uint64_t lut[][4] = { + { 0x0000000000000000ul, 0x0000000000000000ul }, + { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul }, + { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul }, + { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul }, + }; + assert(!(mask & ~0x3ul) && "inbound mask"); + return wasm_v128_load((const v128_t*)lut[mask]); + } + template ::value, void>::type> + inline batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept + { + alignas(A::alignment()) static const uint64_t lut64[] = { + 0x0000000000000000, + 0x000000000000FFFF, + 0x00000000FFFF0000, + 0x00000000FFFFFFFF, + 0x0000FFFF00000000, + 0x0000FFFF0000FFFF, + 0x0000FFFFFFFF0000, + 0x0000FFFFFFFFFFFF, + 0xFFFF000000000000, + 0xFFFF00000000FFFF, + 0xFFFF0000FFFF0000, + 0xFFFF0000FFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFFFFFF0000FFFF, + 0xFFFFFFFFFFFF0000, + 0xFFFFFFFFFFFFFFFF, + }; + alignas(A::alignment()) static const uint32_t lut32[] = { + 0x00000000, + 0x000000FF, + 0x0000FF00, + 0x0000FFFF, + 0x00FF0000, + 0x00FF00FF, + 0x00FFFF00, + 0x00FFFFFF, + 0xFF000000, + 0xFF0000FF, + 0xFF00FF00, + 0xFF00FFFF, + 0xFFFF0000, + 0xFFFF00FF, + 0xFFFFFF00, + 0xFFFFFFFF, + }; + 
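+            // Each 4-bit slice of the mask indexes a table entry that expands it into four
+            // all-ones or all-zero lanes; the slices are then packed back into one v128.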
XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + assert(!(mask & ~0xFFFF) && "inbound mask"); + return wasm_i32x4_make(lut32[mask & 0xF], lut32[(mask >> 4) & 0xF], lut32[(mask >> 8) & 0xF], lut32[mask >> 12]); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + assert(!(mask & ~0xFF) && "inbound mask"); + return wasm_i64x2_make(lut64[mask >> 4], lut64[mask & 0xF]); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return from_mask(batch_bool {}, mask, wasm {}); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return from_mask(batch_bool {}, mask, wasm {}); + } + } + // ge template inline batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept @@ -467,6 +563,11 @@ namespace xsimd { return wasm_i64x2_gt(self, other); } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } } else { @@ -495,6 +596,27 @@ namespace xsimd return wasm_f64x2_gt(self, other); } + // haddp + template + inline batch haddp(batch const* row, requires_arch) noexcept + { + v128_t tmp0 = wasm_i32x4_shuffle(row[0], row[1], 0, 4, 1, 5); + v128_t tmp1 = wasm_i32x4_shuffle(row[0], row[1], 2, 6, 3, 7); + v128_t tmp2 = wasm_i32x4_shuffle(row[2], row[3], 2, 6, 3, 7); + tmp0 = wasm_f32x4_add(tmp0, tmp1); + tmp1 = wasm_i32x4_shuffle(row[2], row[3], 0, 4, 1, 5); + tmp1 = wasm_f32x4_add(tmp1, tmp2); + tmp2 = wasm_i32x4_shuffle(tmp1, tmp0, 6, 7, 2, 3); + tmp0 = wasm_i32x4_shuffle(tmp0, tmp1, 0, 1, 4, 5); + return wasm_f32x4_add(tmp0, tmp2); + } + template + inline batch haddp(batch const* row, requires_arch) noexcept + { + return wasm_f64x2_add(wasm_i64x2_shuffle(row[0], row[1], 0, 2), + wasm_i64x2_shuffle(row[0], row[1], 1, 3)); + } + // insert template inline batch insert(batch const& self, float val, index pos, requires_arch) noexcept @@ -588,22 +710,34 @@ namespace xsimd template inline batch load_aligned(float const* mem, convert, requires_arch) noexcept { - // Assuming that mem is aligned properly, you can use wasm_v128_load to load the mem. return wasm_v128_load(mem); } template ::value, void>::type> inline batch load_aligned(T const* mem, convert, requires_arch) noexcept { - // Assuming that mem is aligned properly, you can use wasm_v128_load to load the mem. return wasm_v128_load((v128_t const*)mem); } template inline batch load_aligned(double const* mem, convert, requires_arch) noexcept { - // Assuming that mem is aligned properly, you can use wasm_v128_load to load the mem. 
return wasm_v128_load(mem); } + // load_complex + namespace detail + { + template + inline batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + { + return { wasm_i32x4_shuffle(hi, lo, 0, 2, 4, 6), wasm_i32x4_shuffle(hi, lo, 1, 3, 5, 7) }; + } + template + inline batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + { + return { wasm_i64x2_shuffle(hi, lo, 0, 2), wasm_i64x2_shuffle(hi, lo, 1, 3) }; + } + } + // load_unaligned template inline batch load_unaligned(float const* mem, convert, requires_arch) noexcept @@ -670,7 +804,15 @@ namespace xsimd } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { - return lt(self, other, generic {}); + auto xself = wasm_v128_xor(self, wasm_i64x2_splat(std::numeric_limits::lowest())); + auto xother = wasm_v128_xor(other, wasm_i64x2_splat(std::numeric_limits::lowest())); + v128_t tmp1 = wasm_i64x2_sub(xself, xother); + v128_t tmp2 = wasm_v128_xor(xself, xother); + v128_t tmp3 = wasm_v128_andnot(xself, xother); + v128_t tmp4 = wasm_v128_andnot(tmp1, tmp2); + v128_t tmp5 = wasm_v128_or(tmp3, tmp4); + v128_t tmp6 = wasm_i32x4_shr(tmp5, 31); + return wasm_i32x4_shuffle(tmp6, wasm_i32x4_splat(0), 1, 1, 3, 3); } else { @@ -856,6 +998,47 @@ namespace xsimd return wasm_f64x2_div(one, self); } + // reduce_add + template + inline float reduce_add(batch const& self, requires_arch) noexcept + { + v128_t tmp0 = wasm_f32x4_add(self, wasm_i32x4_shuffle(self, self, 6, 7, 2, 3)); + v128_t tmp1 = wasm_i32x4_shuffle(tmp0, tmp0, 1, 0, 4, 4); + v128_t tmp2 = wasm_f32x4_add(tmp0, tmp1); + v128_t tmp3 = wasm_i32x4_shuffle(tmp0, tmp2, 4, 1, 2, 3); + return wasm_f32x4_extract_lane(tmp3, 0); + } + template ::value, void>::type> + inline T reduce_add(batch const& self, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + v128_t tmp0 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2, 3, 0, 0); + v128_t tmp1 = wasm_i32x4_add(self, tmp0); + v128_t tmp2 = wasm_i32x4_shuffle(tmp1, wasm_i32x4_splat(0), 1, 0, 0, 0); + v128_t tmp3 = wasm_i32x4_add(tmp1, tmp2); + return wasm_i32x4_extract_lane(tmp3, 0); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + v128_t tmp0 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2, 3, 0, 0); + v128_t tmp1 = wasm_i64x2_add(self, tmp0); + return wasm_i64x2_extract_lane(tmp1, 0); + } + else + { + return hadd(self, generic {}); + } + } + template + inline double reduce_add(batch const& self, requires_arch) noexcept + { + v128_t tmp0 = wasm_i64x2_shuffle(self, self, 1, 3); + v128_t tmp1 = wasm_f64x2_add(self, tmp0); + v128_t tmp2 = wasm_i64x2_shuffle(tmp0, tmp1, 2, 1); + return wasm_f64x2_extract_lane(tmp2, 0); + } + // rsqrt template inline batch rsqrt(batch const& self, requires_arch) noexcept @@ -870,6 +1053,74 @@ namespace xsimd return wasm_f64x2_div(one, wasm_f64x2_sqrt(self)); } + // slide_left + template + inline batch slide_left(batch const& x, requires_arch) noexcept + { + return wasm_i8x16_shuffle( + wasm_i64x2_const(0, 0), x, ((N)&0xF0) ? 0 : 16 - ((N)&0xF), + ((N)&0xF0) ? 0 : 17 - ((N)&0xF), ((N)&0xF0) ? 0 : 18 - ((N)&0xF), + ((N)&0xF0) ? 0 : 19 - ((N)&0xF), ((N)&0xF0) ? 0 : 20 - ((N)&0xF), + ((N)&0xF0) ? 0 : 21 - ((N)&0xF), ((N)&0xF0) ? 0 : 22 - ((N)&0xF), + ((N)&0xF0) ? 0 : 23 - ((N)&0xF), ((N)&0xF0) ? 0 : 24 - ((N)&0xF), + ((N)&0xF0) ? 0 : 25 - ((N)&0xF), ((N)&0xF0) ? 0 : 26 - ((N)&0xF), + ((N)&0xF0) ? 0 : 27 - ((N)&0xF), ((N)&0xF0) ? 0 : 28 - ((N)&0xF), + ((N)&0xF0) ? 0 : 29 - ((N)&0xF), ((N)&0xF0) ? 0 : 30 - ((N)&0xF), + ((N)&0xF0) ? 
0 : 31 - ((N)&0xF)); + } + + // slide_right + template + inline batch slide_right(batch const& x, requires_arch) noexcept + { + return wasm_i8x16_shuffle( + x, wasm_i64x2_const(0, 0), ((N)&0xF0) ? 16 : ((N)&0xF) + 0, + ((N)&0xF0) ? 16 : ((N)&0xF) + 1, ((N)&0xF0) ? 16 : ((N)&0xF) + 2, + ((N)&0xF0) ? 16 : ((N)&0xF) + 3, ((N)&0xF0) ? 16 : ((N)&0xF) + 4, + ((N)&0xF0) ? 16 : ((N)&0xF) + 5, ((N)&0xF0) ? 16 : ((N)&0xF) + 6, + ((N)&0xF0) ? 16 : ((N)&0xF) + 7, ((N)&0xF0) ? 16 : ((N)&0xF) + 8, + ((N)&0xF0) ? 16 : ((N)&0xF) + 9, ((N)&0xF0) ? 16 : ((N)&0xF) + 10, + ((N)&0xF0) ? 16 : ((N)&0xF) + 11, ((N)&0xF0) ? 16 : ((N)&0xF) + 12, + ((N)&0xF0) ? 16 : ((N)&0xF) + 13, ((N)&0xF0) ? 16 : ((N)&0xF) + 14, + ((N)&0xF0) ? 16 : ((N)&0xF) + 15); + } + + // sadd + template ::value, void>::type> + inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_i8x16_add_sat(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_i16x8_add_sat(self, other); + } + else + { + return sadd(self, other, generic {}); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_u8x16_add_sat(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_u16x8_add_sat(self, other); + } + else + { + return sadd(self, other, generic {}); + } + } + } + // select template inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept @@ -952,6 +1203,42 @@ namespace xsimd return set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data; } + // ssub + template ::value, void>::type> + inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_i8x16_sub_sat(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_i16x8_sub_sat(self, other); + } + else + { + return ssub(self, other, generic {}); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_u8x16_sub_sat(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_u16x8_sub_sat(self, other); + } + else + { + return ssub(self, other, generic {}); + } + } + } + // store_aligned template inline void store_aligned(float* mem, batch const& self, requires_arch) noexcept @@ -978,6 +1265,33 @@ namespace xsimd return wasm_v128_store(mem, self); } + // store_complex + namespace detail + { + // complex_low + template + inline batch complex_low(batch, A> const& self, requires_arch) noexcept + { + return wasm_i32x4_shuffle(self.real(), self.imag(), 0, 4, 1, 5); + } + // complex_high + template + inline batch complex_high(batch, A> const& self, requires_arch) noexcept + { + return wasm_i32x4_shuffle(self.real(), self.imag(), 2, 6, 3, 7); + } + template + inline batch complex_low(batch, A> const& self, requires_arch) noexcept + { + return wasm_i64x2_shuffle(self.real(), self.imag(), 0, 2); + } + template + inline batch complex_high(batch, A> const& self, requires_arch) noexcept + { + return wasm_i64x2_shuffle(self.real(), self.imag(), 1, 3); + } + } + // store_unaligned template inline void store_unaligned(float* mem, batch const& self, requires_arch) noexcept @@ -1060,6 +1374,80 @@ namespace xsimd { return wasm_f64x2_trunc(self); } + + // zip_hi + template + inline batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_i32x4_shuffle(self, other, 2, 6, 3, 7); + } + 
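+        // zip_hi interleaves the upper halves of its two operands: for {a0,a1,a2,a3} and
+        // {b0,b1,b2,b3} the shuffle indices 2, 6, 3, 7 select a2, b2, a3, b3 (indices 0-3
+        // address `self`, 4-7 address `other`). The integer and double overloads below
+        // follow the same pattern at their own lane widths.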
template ::value, void>::type> + inline batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_i8x16_shuffle(self, other, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_i16x8_shuffle(self, other, 4, 12, 5, 13, 6, 14, 7, 15); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_i32x4_shuffle(self, other, 2, 6, 3, 7); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return wasm_i64x2_shuffle(self, other, 1, 3); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + inline batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_i64x2_shuffle(self, other, 1, 3); + } + + // zip_lo + template + inline batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_i32x4_shuffle(self, other, 0, 4, 1, 5); + } + template ::value, void>::type> + inline batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_i8x16_shuffle(self, other, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_i16x8_shuffle(self, other, 0, 8, 1, 9, 2, 10, 3, 11); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_i32x4_shuffle(self, other, 0, 4, 1, 5); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return wasm_i64x2_shuffle(self, other, 0, 2); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + inline batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_i64x2_shuffle(self, other, 0, 2); + } } } diff --git a/test/test_wasm/test_wasm.cpp b/test/test_wasm/test_wasm.cpp deleted file mode 100644 index 65ee6f89c..000000000 --- a/test/test_wasm/test_wasm.cpp +++ /dev/null @@ -1,31 +0,0 @@ -#include "xsimd/xsimd.hpp" -#include -#include - -#include -#include // for reporting errors - -using namespace emscripten; - -int test_abs() -{ - std::cout << "test_abs" << std::endl; - auto ans = xsimd::abs(a); - std::cout << ans << std::endl; - return 0; -} - -int run_tests() -{ - // todo add actual tests - if (auto ret = test_(); ret != 0) - { - return ret; - } - return 0; -} - -EMSCRIPTEN_BINDINGS(my_module) -{ - emscripten::function("run_tests", &run_tests); -} \ No newline at end of file From 2898acfbbe4df0184459f41341726914dc81909f Mon Sep 17 00:00:00 2001 From: anutosh491 Date: Wed, 25 Oct 2023 14:40:01 +0530 Subject: [PATCH 04/10] Implemented sadd, ssub & reduce_add through the generic implementation for xsimd_sse2 --- include/xsimd/arch/xsimd_sse2.hpp | 66 +++---------------------------- 1 file changed, 6 insertions(+), 60 deletions(-) diff --git a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp index 17633fd68..ec173f7c9 100644 --- a/include/xsimd/arch/xsimd_sse2.hpp +++ b/include/xsimd/arch/xsimd_sse2.hpp @@ -1237,22 +1237,7 @@ namespace xsimd batch acc3 = min(acc2, step3); return acc3.get(0); } - // TODO: move this in xsimd_generic - namespace detail - { - template ::value, void>::type> - inline T hadd_default(batch const& self, requires_arch) noexcept - { - alignas(A::alignment()) T buffer[batch::size]; - self.store_aligned(buffer); - T res = 0; - for (T val : buffer) - { - res += val; - } - return res; - } - } + template ::value, void>::type> inline T reduce_add(batch 
const& self, requires_arch) noexcept { @@ -1280,7 +1265,7 @@ namespace xsimd } else { - return detail::hadd_default(self, A {}); + return hadd(self, generic {}); } } template @@ -1381,28 +1366,6 @@ namespace xsimd // sadd - // TODO: move this in xsimd_generic - namespace detail - { - template ::value, void>::type> - inline batch sadd_default(batch const& self, batch const& other, requires_arch) noexcept - { - if (std::is_signed::value) - { - auto mask = (other >> (8 * sizeof(T) - 1)); - auto self_pos_branch = min(std::numeric_limits::max() - other, self); - auto self_neg_branch = max(std::numeric_limits::min() - other, self); - return other + select(batch_bool(mask.data), self_neg_branch, self_pos_branch); - } - else - { - const auto diffmax = std::numeric_limits::max() - self; - const auto mindiff = min(diffmax, other); - return self + mindiff; - } - } - } - template ::value, void>::type> inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept { @@ -1418,7 +1381,7 @@ namespace xsimd } else { - return detail::sadd_default(self, other, A {}); + return sadd(self, other, generic {}); } } else @@ -1433,7 +1396,7 @@ namespace xsimd } else { - return detail::sadd_default(self, other, A {}); + return sadd(self, other, generic {}); } } } @@ -1495,23 +1458,6 @@ namespace xsimd } // ssub - // TODO: move this in xsimd_generic - namespace detail - { - template ::value, void>::type> - inline batch ssub_default(batch const& self, batch const& other, requires_arch) noexcept - { - if (std::is_signed::value) - { - return sadd(self, -other); - } - else - { - const auto diff = min(self, other); - return self - diff; - } - } - } template ::value, void>::type> inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept @@ -1528,7 +1474,7 @@ namespace xsimd } else { - return detail::ssub_default(self, other, A {}); + return ssub(self, other, generic {}); } } else @@ -1543,7 +1489,7 @@ namespace xsimd } else { - return detail::ssub_default(self, other, A {}); + return ssub(self, other, generic {}); } } } From 029aa9bfd70497128db41867b779bc05fdec473d Mon Sep 17 00:00:00 2001 From: Johan Mabille Date: Sun, 29 Oct 2023 06:13:16 +0100 Subject: [PATCH 05/10] Fixed RTD build --- docs/environment.yml | 2 +- readthedocs.yml | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/docs/environment.yml b/docs/environment.yml index 63b08b681..2dee10879 100644 --- a/docs/environment.yml +++ b/docs/environment.yml @@ -5,4 +5,4 @@ channels: dependencies: - breathe - #- docutils<0.17 + - sphinx_rtd_theme diff --git a/readthedocs.yml b/readthedocs.yml index 02f0e7baa..38f414be8 100644 --- a/readthedocs.yml +++ b/readthedocs.yml @@ -1,2 +1,9 @@ +version: 2 + +build: + os: "ubuntu-22.04" + tools: + python: "mambaforge-22.9" + conda: - file: docs/environment.yml + environment: docs/environment.yml From f9dcafb5b502030e801d05af9beba018bdda8068 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 29 Oct 2023 22:48:07 +0100 Subject: [PATCH 06/10] Provide a generic version for float to uint32_t conversion, only if the int32 conversion is available This remove duplicate code for various Intel implementation and should also help for #962. 
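
The generic helper added here splits the unsigned 32-bit input into two 16-bit halves
that are exactly representable in float, converts each half with the signed
int32 -> float cast the target is assumed to provide, and recombines them as
hi * 65536.0f + lo, so at most the final addition can round. A rough scalar sketch of
the same decomposition (the function name is made up for illustration):

    #include <cstdint>

    inline float u32_to_float_sketch(uint32_t v)
    {
        uint32_t lo = v & 0xFFFFu; // low 16 bits, exact in float
        uint32_t hi = v >> 16;     // high 16 bits, exact in float
        float lo_f = static_cast<float>(static_cast<int32_t>(lo)); // no rounding
        float hi_f = static_cast<float>(static_cast<int32_t>(hi)); // no rounding
        return hi_f * 65536.0f + lo_f; // rounding, if any, happens only here
    }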
--- .../arch/generic/xsimd_generic_details.hpp | 17 +++++++++++++++++ include/xsimd/arch/xsimd_avx.hpp | 16 ---------------- include/xsimd/arch/xsimd_avx2.hpp | 15 --------------- include/xsimd/arch/xsimd_sse2.hpp | 15 --------------- 4 files changed, 17 insertions(+), 46 deletions(-) diff --git a/include/xsimd/arch/generic/xsimd_generic_details.hpp b/include/xsimd/arch/generic/xsimd_generic_details.hpp index 90d7643ee..3e8c764c6 100644 --- a/include/xsimd/arch/generic/xsimd_generic_details.hpp +++ b/include/xsimd/arch/generic/xsimd_generic_details.hpp @@ -180,6 +180,23 @@ namespace xsimd { return bitwise_cast(self); } + + // Provide a generic uint32_t -> float cast only if we have a + // non-generic int32_t -> float fast_cast + template const&>(), std::declval const&>(), A {}))> + inline batch fast_cast(batch const& v, batch const&, requires_arch) noexcept + { + // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse + batch msk_lo(0xFFFF); + batch cnst65536f(65536.0f); + + auto v_lo = batch_cast(v & msk_lo); /* extract the 16 lowest significant bits of self */ + auto v_hi = batch_cast(v >> 16); /* 16 most significant bits of v */ + auto v_lo_flt = batch_cast(v_lo); /* No rounding */ + auto v_hi_flt = batch_cast(v_hi); /* No rounding */ + v_hi_flt = cnst65536f * v_hi_flt; /* No rounding */ + return v_hi_flt + v_lo_flt; /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */ + } } namespace detail diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp index 76c297211..162e179ee 100644 --- a/include/xsimd/arch/xsimd_avx.hpp +++ b/include/xsimd/arch/xsimd_avx.hpp @@ -515,22 +515,6 @@ namespace xsimd return _mm256_cvtepi32_ps(self); } - template - inline batch fast_cast(batch const& v, batch const&, requires_arch) noexcept - { - // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse - // adapted to avx - __m256i msk_lo = _mm256_set1_epi32(0xFFFF); - __m256 cnst65536f = _mm256_set1_ps(65536.0f); - - __m256i v_lo = bitwise_and(batch(v), batch(msk_lo)); /* extract the 16 lowest significant bits of self */ - __m256i v_hi = bitwise_rshift(batch(v), 16, avx {}); /* 16 most significant bits of v */ - __m256 v_lo_flt = _mm256_cvtepi32_ps(v_lo); /* No rounding */ - __m256 v_hi_flt = _mm256_cvtepi32_ps(v_hi); /* No rounding */ - v_hi_flt = _mm256_mul_ps(cnst65536f, v_hi_flt); /* No rounding */ - return _mm256_add_ps(v_hi_flt, v_lo_flt); /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */ - } - template inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index 7403afe33..a5b07ec9d 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -279,21 +279,6 @@ namespace xsimd namespace detail { - template - inline batch fast_cast(batch const& v, batch const&, requires_arch) noexcept - { - // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse - __m256i msk_lo = _mm256_set1_epi32(0xFFFF); - __m256 cnst65536f = _mm256_set1_ps(65536.0f); - - __m256i v_lo = _mm256_and_si256(v, msk_lo); /* extract the 16 lowest significant bits of self */ - __m256i v_hi = _mm256_srli_epi32(v, 16); /* 16 most significant bits of v */ - __m256 v_lo_flt = _mm256_cvtepi32_ps(v_lo); /* No rounding */ - __m256 v_hi_flt = _mm256_cvtepi32_ps(v_hi); /* No rounding */ - v_hi_flt = 
_mm256_mul_ps(cnst65536f, v_hi_flt); /* No rounding */ - return _mm256_add_ps(v_hi_flt, v_lo_flt); /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */ - } - template inline batch fast_cast(batch const& x, batch const&, requires_arch) noexcept { diff --git a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp index ec173f7c9..a2e035dc5 100644 --- a/include/xsimd/arch/xsimd_sse2.hpp +++ b/include/xsimd/arch/xsimd_sse2.hpp @@ -541,21 +541,6 @@ namespace xsimd return _mm_cvtepi32_ps(self); } - template - inline batch fast_cast(batch const& v, batch const&, requires_arch) noexcept - { - // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse - __m128i msk_lo = _mm_set1_epi32(0xFFFF); - __m128 cnst65536f = _mm_set1_ps(65536.0f); - - __m128i v_lo = _mm_and_si128(v, msk_lo); /* extract the 16 lowest significant bits of self */ - __m128i v_hi = _mm_srli_epi32(v, 16); /* 16 most significant bits of v */ - __m128 v_lo_flt = _mm_cvtepi32_ps(v_lo); /* No rounding */ - __m128 v_hi_flt = _mm_cvtepi32_ps(v_hi); /* No rounding */ - v_hi_flt = _mm_mul_ps(cnst65536f, v_hi_flt); /* No rounding */ - return _mm_add_ps(v_hi_flt, v_lo_flt); /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */ - } - template inline batch fast_cast(batch const& x, batch const&, requires_arch) noexcept { From 0ba53ef543b3c41e0e5145abe7d8b3b4861a7748 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Tue, 31 Oct 2023 23:46:21 +0100 Subject: [PATCH 07/10] Provide a generic version for uint32_t to float conversion, only if the int32 conversion is available This remove duplicate code for various Intel implementation and should also help for #962. --- .../xsimd/arch/generic/xsimd_generic_details.hpp | 13 +++++++++++++ include/xsimd/arch/xsimd_avx.hpp | 11 ----------- include/xsimd/arch/xsimd_sse2.hpp | 12 ------------ include/xsimd/arch/xsimd_sse4_1.hpp | 11 ----------- 4 files changed, 13 insertions(+), 34 deletions(-) diff --git a/include/xsimd/arch/generic/xsimd_generic_details.hpp b/include/xsimd/arch/generic/xsimd_generic_details.hpp index 3e8c764c6..14c62a089 100644 --- a/include/xsimd/arch/generic/xsimd_generic_details.hpp +++ b/include/xsimd/arch/generic/xsimd_generic_details.hpp @@ -197,6 +197,19 @@ namespace xsimd v_hi_flt = cnst65536f * v_hi_flt; /* No rounding */ return v_hi_flt + v_lo_flt; /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */ } + + // Provide a generic float -> uint32_t cast only if we have a + // non-generic float -> int32_t fast_cast + template const&>(), std::declval const&>(), A {}))> + inline batch fast_cast(batch const& v, batch const&, requires_arch) noexcept + { + auto is_large = v >= batch(1u << 31); + auto small = bitwise_cast(batch_cast(v)); + auto large = bitwise_cast( + batch_cast(v - batch(1u << 31)) + ^ batch(1u << 31)); + return bitwise_cast(select(is_large, large, small)); + } } namespace detail diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp index 162e179ee..5ec1e02d4 100644 --- a/include/xsimd/arch/xsimd_avx.hpp +++ b/include/xsimd/arch/xsimd_avx.hpp @@ -520,17 +520,6 @@ namespace xsimd { return _mm256_cvttps_epi32(self); } - - template - inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept - { - return _mm256_castps_si256( - _mm256_blendv_ps(_mm256_castsi256_ps(_mm256_cvttps_epi32(self)), - _mm256_xor_ps( - _mm256_castsi256_ps(_mm256_cvttps_epi32(_mm256_sub_ps(self, 
_mm256_set1_ps(1u << 31)))), - _mm256_castsi256_ps(_mm256_set1_epi32(1u << 31))), - _mm256_cmp_ps(self, _mm256_set1_ps(1u << 31), _CMP_GE_OQ))); - } } // decr_if diff --git a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp index a2e035dc5..1639ba2bf 100644 --- a/include/xsimd/arch/xsimd_sse2.hpp +++ b/include/xsimd/arch/xsimd_sse2.hpp @@ -573,18 +573,6 @@ namespace xsimd { return _mm_cvttps_epi32(self); } - - template - inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept - { - __m128 mask = _mm_cmpge_ps(self, _mm_set1_ps(1u << 31)); - __m128 lhs = _mm_castsi128_ps(_mm_cvttps_epi32(self)); - __m128 rhs = _mm_castsi128_ps(_mm_xor_si128( - _mm_cvttps_epi32(_mm_sub_ps(self, _mm_set1_ps(1u << 31))), - _mm_set1_epi32(1u << 31))); - return _mm_castps_si128(_mm_or_ps(_mm_and_ps(mask, rhs), _mm_andnot_ps(mask, lhs))); - } - } // eq diff --git a/include/xsimd/arch/xsimd_sse4_1.hpp b/include/xsimd/arch/xsimd_sse4_1.hpp index c0e2878ef..165a191e4 100644 --- a/include/xsimd/arch/xsimd_sse4_1.hpp +++ b/include/xsimd/arch/xsimd_sse4_1.hpp @@ -65,17 +65,6 @@ namespace xsimd __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52 return _mm_add_pd(f, _mm_castsi128_pd(xL)); } - - template - inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept - { - return _mm_castps_si128( - _mm_blendv_ps(_mm_castsi128_ps(_mm_cvttps_epi32(self)), - _mm_castsi128_ps(_mm_xor_si128( - _mm_cvttps_epi32(_mm_sub_ps(self, _mm_set1_ps(1u << 31))), - _mm_set1_epi32(1u << 31))), - _mm_cmpge_ps(self, _mm_set1_ps(1u << 31)))); - } } // eq From 2eaa6ee9ff56f8ae13e4f6b7cb9d40ee9f84b6b6 Mon Sep 17 00:00:00 2001 From: anutosh491 Date: Wed, 25 Oct 2023 18:16:59 +0530 Subject: [PATCH 08/10] Implemented the following operations for the wasm instruction set: reduce_max, reduce_min, swizzle, shuffle, bitwise_cast, fast_cast --- include/xsimd/arch/xsimd_wasm.hpp | 186 +++++++++++++++++++++++++++++- 1 file changed, 180 insertions(+), 6 deletions(-) diff --git a/include/xsimd/arch/xsimd_wasm.hpp b/include/xsimd/arch/xsimd_wasm.hpp index f161120c6..32a5d67c8 100644 --- a/include/xsimd/arch/xsimd_wasm.hpp +++ b/include/xsimd/arch/xsimd_wasm.hpp @@ -19,11 +19,25 @@ namespace xsimd { + template + struct batch_bool_constant; + + template + inline batch bitwise_cast(batch const& x) noexcept; + + template + struct batch_constant; namespace kernel { using namespace types; + // fwd + template + inline batch insert(batch const& self, T val, index, requires_arch) noexcept; + template + inline batch shuffle(batch const& x, batch const& y, batch_constant, Indices...>, requires_arch) noexcept; + // abs template ::value && std::is_signed::value, void>::type> inline batch abs(batch const& self, requires_arch) noexcept @@ -136,6 +150,13 @@ namespace xsimd return wasm_i8x16_bitmask(self) != 0; } + // batch_bool_cast + template + inline batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch) noexcept + { + return { bitwise_cast(batch(self.data)).data }; + } + // bitwise_and template inline batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept @@ -162,6 +183,13 @@ namespace xsimd return wasm_v128_andnot(self, other); } + // bitwise_cast + template + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return batch(self.data); + } + // bitwise_or template inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept @@ -415,6 
+443,53 @@ namespace xsimd return wasm_f64x2_eq(self, other); } + // fast_cast + namespace detail + { + template + inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return wasm_f32x4_convert_i32x4(self); + } + + template + inline batch fast_cast(batch const& x, batch const&, requires_arch) noexcept + { + // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx + // adapted to wasm + v128_t xH = wasm_u64x2_shr(x, 32); + xH = wasm_v128_or(xH, wasm_f64x2_splat(19342813113834066795298816.)); // 2^84 + v128_t mask = wasm_i16x8_make(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000); + v128_t xL = wasm_v128_or(wasm_v128_and(mask, x), wasm_v128_andnot(wasm_f64x2_splat(0x0010000000000000), mask)); // 2^52 + v128_t f = wasm_f64x2_sub(xH, wasm_f64x2_splat(19342813118337666422669312.)); // 2^84 + 2^52 + return wasm_f64x2_add(f, xL); + } + + template + inline batch fast_cast(batch const& x, batch const&, requires_arch) noexcept + { + // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx + // adapted to wasm + v128_t xH = wasm_i32x4_shr(x, 16); + xH = wasm_v128_and(xH, wasm_i16x8_make(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF)); + xH = wasm_i64x2_add(xH, wasm_f64x2_splat(442721857769029238784.)); // 3*2^67 + v128_t mask = wasm_i16x8_make(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000); + v128_t xL = wasm_v128_or(wasm_v128_and(mask, x), wasm_v128_andnot(wasm_f64x2_splat(0x0010000000000000), mask)); // 2^52 + v128_t f = wasm_f64x2_sub(xH, wasm_f64x2_splat(442726361368656609280.)); // 3*2^67 + 2^52 + return wasm_f64x2_add(f, xL); + } + + template + inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return wasm_i32x4_make( + static_cast(wasm_f32x4_extract_lane(self, 0)), + static_cast(wasm_f32x4_extract_lane(self, 1)), + static_cast(wasm_f32x4_extract_lane(self, 2)), + static_cast(wasm_f32x4_extract_lane(self, 3))); + } + } + // floor template inline batch floor(batch const& self, requires_arch) noexcept @@ -516,11 +591,11 @@ namespace xsimd } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { - return from_mask(batch_bool {}, mask, wasm {}); + return batch_bool_cast(from_mask(batch_bool {}, mask, wasm {})); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { - return from_mask(batch_bool {}, mask, wasm {}); + return batch_bool_cast(from_mask(batch_bool {}, mask, wasm {})); } } @@ -1039,6 +1114,44 @@ namespace xsimd return wasm_f64x2_extract_lane(tmp2, 0); } + // reduce_max + template ::type> + inline T reduce_max(batch const& self, requires_arch) noexcept + { + batch step0 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2, 3, 0, 0); + batch acc0 = max(self, step0); + + batch step1 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 1, 0, 0, 0); + batch acc1 = max(acc0, step1); + + batch step2 = wasm_i16x8_shuffle(acc1, wasm_i16x8_splat(0), 1, 0, 0, 0, 4, 5, 6, 7); + batch acc2 = max(acc1, step2); + if (sizeof(T) == 2) + return acc2.get(0); + batch step3 = bitwise_cast(bitwise_cast(acc2) >> 8); + batch acc3 = max(acc2, step3); + return acc3.get(0); + } + + // reduce_min + template ::type> + inline T reduce_min(batch const& self, requires_arch) noexcept + { + batch step0 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2, 3, 0, 0); + batch acc0 = min(self, step0); + + batch step1 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 1, 0, 0, 0); + batch acc1 = min(acc0, step1); + + batch step2 
= wasm_i16x8_shuffle(acc1, wasm_i16x8_splat(0), 1, 0, 0, 0, 4, 5, 6, 7); + batch acc2 = min(acc1, step2); + if (sizeof(T) == 2) + return acc2.get(0); + batch step3 = bitwise_cast(bitwise_cast(acc2) >> 8); + batch acc3 = min(acc2, step3); + return acc3.get(0); + } + // rsqrt template inline batch rsqrt(batch const& self, requires_arch) noexcept @@ -1144,6 +1257,33 @@ namespace xsimd return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(false_br, cond)); } + // shuffle + template + inline batch shuffle(batch const& x, batch const& y, batch_constant, I0, I1, I2, I3> mask, requires_arch) noexcept + { + // shuffle within lane + if (I0 < 4 && I1 < 4 && I2 >= 4 && I3 >= 4) + return wasm_i32x4_shuffle(x, y, I0, I1, I2, I3); + + // shuffle within opposite lane + if (I0 >= 4 && I1 >= 4 && I2 < 4 && I3 < 4) + return wasm_i32x4_shuffle(y, x, I0, I1, I2, I3); + return shuffle(x, y, mask, generic {}); + } + + template + inline batch shuffle(batch const& x, batch const& y, batch_constant, I0, I1> mask, requires_arch) noexcept + { + // shuffle within lane + if (I0 < 2 && I1 >= 2) + return wasm_i64x2_shuffle(x, y, I0, I1); + + // shuffle within opposite lane + if (I0 >= 2 && I1 < 2) + return wasm_i64x2_shuffle(y, x, I0, I1); + return shuffle(x, y, mask, generic {}); + } + // set template inline batch set(batch const&, requires_arch, Values... values) noexcept @@ -1243,25 +1383,21 @@ namespace xsimd template inline void store_aligned(float* mem, batch const& self, requires_arch) noexcept { - // Assuming that mem is aligned properly, you can use wasm_v128_store to store the batch. return wasm_v128_store(mem, self); } template ::value, void>::type> inline void store_aligned(T* mem, batch const& self, requires_arch) noexcept { - // Assuming that mem is aligned properly, you can use wasm_v128_store to store the batch. return wasm_v128_store((v128_t*)mem, self); } template ::value, void>::type> inline void store_aligned(T* mem, batch_bool const& self, requires_arch) noexcept { - // Assuming that mem is aligned properly, you can use wasm_v128_store to store the batch. return wasm_v128_store((v128_t*)mem, self); } template inline void store_aligned(double* mem, batch const& self, requires_arch) noexcept { - // Assuming that mem is aligned properly, you can use wasm_v128_store to store the batch. 
return wasm_v128_store(mem, self); } @@ -1363,6 +1499,44 @@ namespace xsimd return wasm_f64x2_sqrt(val); } + // swizzle + + template + inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3>, requires_arch) noexcept + { + return wasm_i32x4_shuffle(self, self, V0, V1, V2, V3); + } + + template + inline batch swizzle(batch const& self, batch_constant, V0, V1>, requires_arch) noexcept + { + return wasm_i64x2_shuffle(self, self, V0, V1); + } + + template + inline batch swizzle(batch const& self, batch_constant, V0, V1>, requires_arch) noexcept + { + return wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1); + } + + template + inline batch swizzle(batch const& self, batch_constant, V0, V1> mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, wasm {})); + } + + template + inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3>, requires_arch) noexcept + { + return wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), V0, V1, V2, V3); + } + + template + inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3> mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, wasm {})); + } + // trunc template inline batch trunc(batch const& self, requires_arch) noexcept From fc6c3fbd83831c6575c85208c875f1b24574cbfd Mon Sep 17 00:00:00 2001 From: Johan Mabille Date: Thu, 2 Nov 2023 09:28:11 +0100 Subject: [PATCH 09/10] Fixed jQuery not loaded in RTD page --- docs/source/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index e0af0a0d7..4471fc992 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -17,7 +17,7 @@ def setup(app): app.add_css_file("main_stylesheet.css") -extensions = ['breathe'] +extensions = ['breathe', 'sphinx_rtd_theme'] breathe_projects = { 'xsimd': '../xml' } templates_path = ['_templates'] html_static_path = ['_static'] From 54278ed4cae61bc782896ccbc722956fa7db027f Mon Sep 17 00:00:00 2001 From: Johan Mabille Date: Thu, 2 Nov 2023 11:05:09 +0100 Subject: [PATCH 10/10] Updated to last Intel SDE --- .github/workflows/linux.yml | 2 +- install_sde.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 7b1631de7..044a7cb18 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -117,7 +117,7 @@ jobs: cd _build cd test if [[ '${{ matrix.sys.flags }}' == 'avx512' || '${{ matrix.sys.flags }}' == 'avx512cd' ]]; then - ../../sde-external-8.56.0-2020-07-05-lin/sde64 -skx -- ./test_xsimd + ../../sde-external-8.69.1-2021-07-18-lin/sde64 -skx -- ./test_xsimd else ./test_xsimd fi diff --git a/install_sde.sh b/install_sde.sh index 9e866316c..934b675e8 100644 --- a/install_sde.sh +++ b/install_sde.sh @@ -4,7 +4,7 @@ #python ./intel-sde-downloader.py sde-external-8.35.0-2019-03-11-lin.tar.bz2 #wget http://software.intel.com/content/dam/develop/external/us/en/protected/sde-external-8.50.0-2020-03-26-lin.tar.bz2 -wget --user-agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36" https://www.intel.com/content/dam/develop/external/us/en/documents/sde-external-8.56.0-2020-07-05-lin.tar.bz2 +wget --user-agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36" https://github.com/xtensor-stack/xsimd-testing-resources/releases/download/1.0.0/sde-external-8.69.1-2021-07-18-lin.tar.bz2 
-tar xvf sde-external-8.56.0-2020-07-05-lin.tar.bz2
+tar xvf sde-external-8.69.1-2021-07-18-lin.tar.bz2
 sudo sh -c "echo 0 > /proc/sys/kernel/yama/ptrace_scope"
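
For reference, a minimal usage sketch of the operations the WASM patches above introduce. It assumes a build with Emscripten and -msimd128 so that XSIMD_WITH_WASM is enabled; the file name, sample values, and build line are illustrative and not part of this series:

// wasm_sketch.cpp -- illustrative only; build with something like:
//   em++ -O2 -msimd128 wasm_sketch.cpp -o wasm_sketch.js
#include "xsimd/xsimd.hpp"
#include <cstdint>
#include <iostream>

int main()
{
    using batch = xsimd::batch<int32_t, xsimd::wasm>;

    batch a(1, -5, 42, 7);
    batch b = xsimd::abs(a);            // lane-wise |a|
    int32_t hi = xsimd::reduce_max(b);  // horizontal maximum, 42 expected
    int32_t lo = xsimd::reduce_min(b);  // horizontal minimum, 1 expected

    // Compile-time lane reversal through the new swizzle overloads.
    using reverse_t = xsimd::batch_constant<xsimd::batch<uint32_t, xsimd::wasm>, 3, 2, 1, 0>;
    batch r = xsimd::swizzle(b, reverse_t {});

    std::cout << hi << " " << lo << " " << r.get(0) << std::endl;
    return 0;
}
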