From e92cd8667231f25d88d233ceca04a64b063e33f3 Mon Sep 17 00:00:00 2001 From: anutosh491 Date: Mon, 9 Oct 2023 10:56:14 +0530 Subject: [PATCH 01/10] Initial Implementation for the new WASM based instruction set --- include/xsimd/arch/xsimd_isa.hpp | 4 + include/xsimd/arch/xsimd_wasm.hpp | 541 ++++++++++++++++++++ include/xsimd/config/xsimd_arch.hpp | 3 +- include/xsimd/config/xsimd_config.hpp | 14 +- include/xsimd/types/xsimd_all_registers.hpp | 2 + include/xsimd/types/xsimd_wasm_register.hpp | 61 +++ test/test_wasm/test_wasm.cpp | 31 ++ 7 files changed, 653 insertions(+), 3 deletions(-) create mode 100644 include/xsimd/arch/xsimd_wasm.hpp create mode 100644 include/xsimd/types/xsimd_wasm_register.hpp create mode 100644 test/test_wasm/test_wasm.cpp diff --git a/include/xsimd/arch/xsimd_isa.hpp b/include/xsimd/arch/xsimd_isa.hpp index cf0f796a1..8f05a5dab 100644 --- a/include/xsimd/arch/xsimd_isa.hpp +++ b/include/xsimd/arch/xsimd_isa.hpp @@ -80,6 +80,10 @@ #include "./xsimd_sve.hpp" #endif +#if XSIMD_WITH_WASM +#include "./xsimd_wasm.hpp" +#endif + // Must come last to have access to all conversion specializations. #include "./xsimd_generic.hpp" diff --git a/include/xsimd/arch/xsimd_wasm.hpp b/include/xsimd/arch/xsimd_wasm.hpp new file mode 100644 index 000000000..63d77ab19 --- /dev/null +++ b/include/xsimd/arch/xsimd_wasm.hpp @@ -0,0 +1,541 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * Copyright (c) Anutosh Bhat * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#ifndef XSIMD_WASM_HPP +#define XSIMD_WASM_HPP + +#include + +#include "../types/xsimd_wasm_register.hpp" + +namespace xsimd +{ + + namespace kernel + { + using namespace types; + + // abs + template ::value && std::is_signed::value, void>::type> + inline batch abs(batch const& self, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_i8x16_abs(self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_i16x8_abs(self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_i32x4_abs(self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return wasm_i64x2_abs(self); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + + template + inline batch abs(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f32x4_abs(self, other); + } + + template + inline batch abs(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f64x2_abs(self, other); + } + + // add + template ::value, void>::type> + inline batch add(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_i8x16_add(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_i16x8_add(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_i32x4_add(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return wasm_i64x2_add(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + + template + inline batch add(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f32x4_add(self, other); + } + + template + inline batch add(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f64x2_add(self, other); + } + + // all + template + inline bool all(batch_bool const& self, requires_arch) noexcept + { + return wasm_i32x4_bitmask(self) == 0x0F; + } + template + inline bool all(batch_bool const& self, requires_arch) noexcept + { + return wasm_i64x2_bitmask(self) == 0x03; + } + template ::value, void>::type> + inline bool all(batch_bool const& self, requires_arch) noexcept + { + return wasm_i8x16_bitmask(self) == 0xFFFF; + } + + // any + template + inline bool any(batch_bool const& self, requires_arch) noexcept + { + return wasm_i32x4_bitmask(self) != 0; + } + template + inline bool any(batch_bool const& self, requires_arch) noexcept + { + return wasm_i64x2_bitmask(self) != 0; + } + template ::value, void>::type> + inline bool any(batch_bool const& self, requires_arch) noexcept + { + return wasm_i8x16_bitmask(self) != 0; + } + + // bitwise_and + template + inline batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_v128_and(self, other); + } + + template + inline batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return wasm_v128_and(self, other); + } + + // bitwise_andnot + template + inline batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_v128_andnot(self, other); + } + + template + inline batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return wasm_v128_andnot(self, other); + } + + // bitwise_or + template + inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + 
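+            // WASM SIMD exposes a single untyped v128_t register for every lane type,
+            // so the bitwise helpers can forward to the type-agnostic wasm_v128_* intrinsics.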
return wasm_v128_or(self, other); + } + + template + inline batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return wasm_v128_or(self, other); + } + + // bitwise_lshift + template ::value, void>::type> + inline batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_v128_and(wasm_i8x16_splat(0xFF << other), wasm_i32x4_shl(self, other)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_i16x8_shl(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_i32x4_shl(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return wasm_i64x2_shl(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + + // bitwise_xor + template + inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_v128_xor(self, other); + } + + template + inline batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return wasm_v128_xor(self, other); + } + + // broadcast + template + batch inline broadcast(float val, requires_arch) noexcept + { + return wasm_f32x4_splat(val); + } + template ::value, void>::type> + inline batch broadcast(T val, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_i8x16_splat(val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_i16x8_splat(val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_i32x4_splat(val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return wasm_i64x2_splat(val); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + inline batch broadcast(double val, requires_arch) noexcept + { + return wasm_f64x2_splat(val); + } + + // div + template + inline batch div(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f32x4_div(self, other); + } + template + inline batch div(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f64x2_div(self, other); + } + + // ge + template + inline batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f32x4_ge(self, other); + } + template + inline batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f64x2_ge(self, other); + } + + // le + template + inline batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f32x4_le(self, other); + } + template + inline batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f64x2_le(self, other); + } + + // load_aligned + template + inline batch load_aligned(float const* mem, convert, requires_arch) noexcept + { + // Assuming that mem is aligned properly, you can use wasm_v128_load to load the mem. + return wasm_v128_load(mem); + } + template ::value, void>::type> + inline batch load_aligned(T const* mem, convert, requires_arch) noexcept + { + // Assuming that mem is aligned properly, you can use wasm_v128_load to load the mem. + return wasm_v128_load((v128_t const*)mem); + } + template + inline batch load_aligned(double const* mem, convert, requires_arch) noexcept + { + // Assuming that mem is aligned properly, you can use wasm_v128_load to load the mem. 
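+            // Note: wasm_v128_load itself tolerates unaligned pointers (alignment is only a
+            // performance hint in WebAssembly), so the aligned and unaligned entry points
+            // can share the same intrinsic.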
+ return wasm_v128_load(mem); + } + + // load_unaligned + template + inline batch load_unaligned(float const* mem, convert, requires_arch) noexcept + { + return wasm_v128_load(mem); + } + template ::value, void>::type> + inline batch load_unaligned(T const* mem, convert, requires_arch) noexcept + { + return wasm_v128_load((v128_t const*)mem); + } + template + inline batch load_unaligned(double const* mem, convert, requires_arch) noexcept + { + return wasm_v128_load(mem); + } + + // max + template + inline batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f32x4_pmax(self, other); + } + template ::value, void>::type> + inline batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return select(self > other, self, other); + } + template + inline batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f64x2_pmax(self, other); + } + + // min + template + inline batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f32x4_pmin(self, other); + } + template ::value, void>::type> + inline batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return select(self <= other, self, other); + } + template + inline batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f64x2_pmin(self, other); + } + + // mul + template + inline batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f32x4_mul(self, other); + } + template + inline batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f64x2_mul(self, other); + } + + // select + template + inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(cond, false_br)); + } + + template ::value, void>::type> + inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(cond, false_br)); + } + template ::value, void>::type> + inline batch select(batch_bool_constant, Values...> const&, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return select(batch_bool { Values... }, true_br, false_br, wasm {}); + } + template + inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(cond, false_br)); + } + + // set + template + inline batch set(batch const&, requires_arch, Values... 
values) noexcept + { + static_assert(sizeof...(Values) == batch::size, "consistent init"); + return wasm_f32x4_make(values...); + } + + template ::value, void>::type> + inline batch set(batch const&, requires_arch, T v0, T v1) noexcept + { + return wasm_i64x2_make(v0, v1); + } + + template ::value, void>::type> + inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3) noexcept + { + return wasm_i32x4_make(v0, v1, v2, v3); + } + + template ::value, void>::type> + inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept + { + return wasm_i16x8_make(v0, v1, v2, v3, v4, v5, v6, v7); + } + + template ::value, void>::type> + inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept + { + return wasm_i8x16_make(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); + } + + template + inline batch set(batch const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch::size, "consistent init"); + return wasm_f64x2_make(values...); + } + + template ::value, void>::type> + inline batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + { + return set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data; + } + + // store_aligned + template + inline void store_aligned(float* mem, batch const& self, requires_arch) noexcept + { + // Assuming that mem is aligned properly, you can use wasm_v128_store to store the batch. + return wasm_v128_store(mem, self); + } + template ::value, void>::type> + inline void store_aligned(T* mem, batch const& self, requires_arch) noexcept + { + // Assuming that mem is aligned properly, you can use wasm_v128_store to store the batch. + return wasm_v128_store((v128_t*)mem, self); + } + template ::value, void>::type> + inline void store_aligned(T* mem, batch_bool const& self, requires_arch) noexcept + { + // Assuming that mem is aligned properly, you can use wasm_v128_store to store the batch. + return wasm_v128_store((v128_t*)mem, self); + } + template + inline void store_aligned(double* mem, batch const& self, requires_arch) noexcept + { + // Assuming that mem is aligned properly, you can use wasm_v128_store to store the batch. 
+ return wasm_v128_store(mem, self); + } + + // store_unaligned + template + inline void store_unaligned(float* mem, batch const& self, requires_arch) noexcept + { + return wasm_v128_store(mem, self); + } + template ::value, void>::type> + inline void store_unaligned(T* mem, batch const& self, requires_arch) noexcept + { + return wasm_v128_store((v128_t*)mem, self); + } + template ::value, void>::type> + inline void store_unaligned(T* mem, batch_bool const& self, requires_arch) noexcept + { + return wasm_v128_store((v128_t*)mem, self); + } + template + inline void store_unaligned(double* mem, batch const& self, requires_arch) noexcept + { + return wasm_v128_store(mem, self); + } + + // sub + template + inline batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f32x4_sub(self, other); + } + template ::value, void>::type> + inline batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_i8x16_sub(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_i16x8_sub(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_i32x4_sub(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return wasm_i64x2_sub(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + inline batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f64x2_sub(self, other); + } + + // sqrt + template + inline batch sqrt(batch const& val, requires_arch) noexcept + { + return wasm_f32x4_sqrt(val); + } + template + inline batch sqrt(batch const& val, requires_arch) noexcept + { + return wasm_f64x2_sqrt(val); + } + } +} + +#endif \ No newline at end of file diff --git a/include/xsimd/config/xsimd_arch.hpp b/include/xsimd/config/xsimd_arch.hpp index 81cada583..ab9ecbc29 100644 --- a/include/xsimd/config/xsimd_arch.hpp +++ b/include/xsimd/config/xsimd_arch.hpp @@ -193,7 +193,8 @@ namespace xsimd using all_x86_architectures = arch_list, avx2, fma3, avx, fma4, fma3, sse4_2, sse4_1, /*sse4a,*/ ssse3, sse3, sse2>; using all_sve_architectures = arch_list, detail::sve<256>, detail::sve<128>>; using all_arm_architectures = typename detail::join>::type; - using all_architectures = typename detail::join::type; + using all_wasm_architectures = arch_list; + using all_architectures = typename detail::join::type; using supported_architectures = typename detail::supported::type; diff --git a/include/xsimd/config/xsimd_config.hpp b/include/xsimd/config/xsimd_config.hpp index d36ed1ee5..27824f3ae 100644 --- a/include/xsimd/config/xsimd_config.hpp +++ b/include/xsimd/config/xsimd_config.hpp @@ -285,6 +285,17 @@ #define XSIMD_SVE_BITS 0 #endif +/** + * @ingroup xsimd_config_macro + * + * Set to 1 if WebAssembly SIMD is available at compile-time, to 0 otherwise. 
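+ * Detection is keyed on the __EMSCRIPTEN__ macro below; the wasm_simd128.h
+ * intrinsics additionally require compiling with Emscripten's -msimd128 flag.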
+ */ +#ifdef __EMSCRIPTEN__ +#define XSIMD_WITH_WASM 1 +#else +#define XSIMD_WITH_WASM 0 +#endif + // Workaround for MSVC compiler #ifdef _MSC_VER @@ -343,8 +354,7 @@ #endif -#if !XSIMD_WITH_SSE2 && !XSIMD_WITH_SSE3 && !XSIMD_WITH_SSSE3 && !XSIMD_WITH_SSE4_1 && !XSIMD_WITH_SSE4_2 && !XSIMD_WITH_AVX && !XSIMD_WITH_AVX2 && !XSIMD_WITH_FMA3_SSE && !XSIMD_WITH_FMA4 && !XSIMD_WITH_FMA3_AVX && !XSIMD_WITH_FMA3_AVX2 && !XSIMD_WITH_AVX512F && !XSIMD_WITH_AVX512CD && !XSIMD_WITH_AVX512DQ && !XSIMD_WITH_AVX512BW && !XSIMD_WITH_NEON && !XSIMD_WITH_NEON64 && !XSIMD_WITH_SVE -#define XSIMD_NO_SUPPORTED_ARCHITECTURE +#if !XSIMD_WITH_SSE2 && !XSIMD_WITH_SSE3 && !XSIMD_WITH_SSSE3 && !XSIMD_WITH_SSE4_1 && !XSIMD_WITH_SSE4_2 && !XSIMD_WITH_AVX && !XSIMD_WITH_AVX2 && !XSIMD_WITH_FMA3_SSE && !XSIMD_WITH_FMA4 && !XSIMD_WITH_FMA3_AVX && !XSIMD_WITH_FMA3_AVX2 && !XSIMD_WITH_AVX512F && !XSIMD_WITH_AVX512CD && !XSIMD_WITH_AVX512DQ && !XSIMD_WITH_AVX512BW && !XSIMD_WITH_NEON && !XSIMD_WITH_NEON64 && !XSIMD_WITH_SVE && !XSIMD_WITH_WASM #endif #endif diff --git a/include/xsimd/types/xsimd_all_registers.hpp b/include/xsimd/types/xsimd_all_registers.hpp index 1fe077325..ec20ce5fb 100644 --- a/include/xsimd/types/xsimd_all_registers.hpp +++ b/include/xsimd/types/xsimd_all_registers.hpp @@ -30,3 +30,5 @@ #include "xsimd_neon_register.hpp" #include "xsimd_sve_register.hpp" + +#include "xsimd_wasm_register.hpp" diff --git a/include/xsimd/types/xsimd_wasm_register.hpp b/include/xsimd/types/xsimd_wasm_register.hpp new file mode 100644 index 000000000..ab8782ac6 --- /dev/null +++ b/include/xsimd/types/xsimd_wasm_register.hpp @@ -0,0 +1,61 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * Copyright (c) Anutosh Bhat * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#ifndef XSIMD_WASM_REGISTER_HPP +#define XSIMD_WASM_REGISTER_HPP + +#include "xsimd_generic_arch.hpp" +#include "xsimd_register.hpp" + +#if XSIMD_WITH_WASM +#include +#endif + +namespace xsimd +{ + /** + * @ingroup architectures + * + * WASM instructions + */ + struct wasm : generic + { + static constexpr bool supported() noexcept { return XSIMD_WITH_WASM; } + static constexpr bool available() noexcept { return true; } + static constexpr bool requires_alignment() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(10, 0, 0); } + static constexpr std::size_t alignment() noexcept { return 16; } + static constexpr char const* name() noexcept { return "wasm"; } + }; + +#if XSIMD_WITH_WASM + namespace types + { + XSIMD_DECLARE_SIMD_REGISTER(bool, wasm, v128_t); + XSIMD_DECLARE_SIMD_REGISTER(signed char, wasm, v128_t); + XSIMD_DECLARE_SIMD_REGISTER(unsigned char, wasm, v128_t); + XSIMD_DECLARE_SIMD_REGISTER(char, wasm, v128_t); + XSIMD_DECLARE_SIMD_REGISTER(unsigned short, wasm, v128_t); + XSIMD_DECLARE_SIMD_REGISTER(short, wasm, v128_t); + XSIMD_DECLARE_SIMD_REGISTER(unsigned int, wasm, v128_t); + XSIMD_DECLARE_SIMD_REGISTER(int, wasm, v128_t); + XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, wasm, v128_t); + XSIMD_DECLARE_SIMD_REGISTER(long int, wasm, v128_t); + XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, wasm, v128_t); + XSIMD_DECLARE_SIMD_REGISTER(long long int, wasm, v128_t); + XSIMD_DECLARE_SIMD_REGISTER(float, wasm, v128_t); + XSIMD_DECLARE_SIMD_REGISTER(double, wasm, v128_t); + } +#endif +} + +#endif diff --git a/test/test_wasm/test_wasm.cpp b/test/test_wasm/test_wasm.cpp new file mode 100644 index 000000000..65ee6f89c --- /dev/null +++ b/test/test_wasm/test_wasm.cpp @@ -0,0 +1,31 @@ +#include "xsimd/xsimd.hpp" +#include +#include + +#include +#include // for reporting errors + +using namespace emscripten; + +int test_abs() +{ + std::cout << "test_abs" << std::endl; + auto ans = xsimd::abs(a); + std::cout << ans << std::endl; + return 0; +} + +int run_tests() +{ + // todo add actual tests + if (auto ret = test_(); ret != 0) + { + return ret; + } + return 0; +} + +EMSCRIPTEN_BINDINGS(my_module) +{ + emscripten::function("run_tests", &run_tests); +} \ No newline at end of file From 6352536f8944e49ac6654431f467d368d2469e06 Mon Sep 17 00:00:00 2001 From: anutosh491 Date: Mon, 9 Oct 2023 12:57:46 +0530 Subject: [PATCH 02/10] Added the following operations through direct intrinsics 1) Bitwise: bitwise_rshift, bitwise_not, bitwise_and, bitwise_or, bitwise_lshift, bitwise_xor, bitwise_andnot 2) Logical: gt, lt, eq, neq, all, any, ge, le 3) Arithmetic: add, sub, mul, div, neg, reciprocal 4) Math: abs, sqrt, rsqrt, max, min 5) Roudning: floor, ceil , trunc 6) Memory: store_aligned, store_unaligned, load_aligned, load_unaligned, set 7) Complex: isnan 7) Misc: mask, select, broadcast, insert --- include/xsimd/arch/xsimd_wasm.hpp | 539 +++++++++++++++++++++++++++++- 1 file changed, 532 insertions(+), 7 deletions(-) diff --git a/include/xsimd/arch/xsimd_wasm.hpp b/include/xsimd/arch/xsimd_wasm.hpp index 63d77ab19..a6f45ee10 100644 --- a/include/xsimd/arch/xsimd_wasm.hpp +++ b/include/xsimd/arch/xsimd_wasm.hpp @@ -52,15 +52,15 @@ namespace xsimd } template - inline batch abs(batch const& self, batch const& other, requires_arch) noexcept + inline batch abs(batch const& self, requires_arch) noexcept { - return wasm_f32x4_abs(self, other); + return 
wasm_f32x4_abs(self); } template - inline batch abs(batch const& self, batch const& other, requires_arch) noexcept + inline batch abs(batch const& self, requires_arch) noexcept { - return wasm_f64x2_abs(self, other); + return wasm_f64x2_abs(self); } // add @@ -202,6 +202,73 @@ namespace xsimd } } + // bitwise_rshift + template ::value, void>::type> + inline batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_i8x16_shr(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_i16x8_shr(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_i32x4_shr(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return wasm_i64x2_shr(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_v128_and(wasm_i8x16_splat(0xFF >> other), wasm_u32x4_shr(self, other)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_u16x8_shr(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_u32x4_shr(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return wasm_u64x2_shr(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + } + + // bitwise_not + template + inline batch bitwise_not(batch const& self, requires_arch) noexcept + { + return wasm_v128_not(self); + } + + template + inline batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + { + return wasm_v128_not(self); + } + // bitwise_xor template inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept @@ -252,6 +319,18 @@ namespace xsimd return wasm_f64x2_splat(val); } + // ceil + template + inline batch ceil(batch const& self, requires_arch) noexcept + { + return wasm_f32x4_ceil(self); + } + template + inline batch ceil(batch const& self, requires_arch) noexcept + { + return wasm_f64x2_ceil(self); + } + // div template inline batch div(batch const& self, batch const& other, requires_arch) noexcept @@ -264,6 +343,91 @@ namespace xsimd return wasm_f64x2_div(self, other); } + // eq + template + inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f32x4_eq(self, other); + } + template + inline batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return wasm_f32x4_eq(self, other); + } + template ::value, void>::type> + inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_i8x16_eq(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_i16x8_eq(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_i32x4_eq(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return wasm_i64x2_eq(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template ::value, void>::type> + inline batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_i8x16_eq(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_i16x8_eq(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_i32x4_eq(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + 
{ + return wasm_i64x2_eq(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f64x2_eq(self, other); + } + template + inline batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return wasm_f64x2_eq(self, other); + } + + // floor + template + inline batch floor(batch const& self, requires_arch) noexcept + { + return wasm_f32x4_floor(self); + } + + template + inline batch floor(batch const& self, requires_arch) noexcept + { + return wasm_f64x2_floor(self); + } + // ge template inline batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept @@ -276,6 +440,138 @@ namespace xsimd return wasm_f64x2_ge(self, other); } + // gt + template + inline batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f32x4_gt(self, other); + } + template ::value, void>::type> + inline batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_i8x16_gt(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_i16x8_gt(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_i32x4_gt(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return wasm_i64x2_gt(self, other); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_u8x16_gt(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_u16x8_gt(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_u32x4_gt(self, other); + } + else + { + return gt(self, other, generic {}); + } + } + } + + template + inline batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f64x2_gt(self, other); + } + + // insert + template + inline batch insert(batch const& self, float val, index pos, requires_arch) noexcept + { + return wasm_f32x4_replace_lane(self, pos, val); + } + template ::value, void>::type> + inline batch insert(batch const& self, T val, index pos, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_i8x16_replace_lane(self, pos, val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_i16x8_replace_lane(self, pos, val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_i32x4_replace_lane(self, pos, val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return wasm_i64x2_replace_lane(self, pos, val); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_u8x16_replace_lane(self, pos, val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_u16x8_replace_lane(self, pos, val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_u32x4_replace_lane(self, pos, val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return wasm_u64x2_replace_lane(self, pos, val); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + } + + template + inline batch insert(batch const& self, double val, index pos, requires_arch) noexcept + { + return wasm_f64x2_replace_lane(self, pos, val); + } + + // isnan + template + inline batch_bool isnan(batch const& self, requires_arch) noexcept + { + return 
wasm_v128_or(wasm_f32x4_ne(self, self), wasm_f32x4_ne(self, self)); + } + template + inline batch_bool isnan(batch const& self, requires_arch) noexcept + { + return wasm_v128_or(wasm_f64x2_ne(self, self), wasm_f64x2_ne(self, self)); + } + // le template inline batch_bool le(batch const& self, batch const& other, requires_arch) noexcept @@ -325,6 +621,109 @@ namespace xsimd return wasm_v128_load(mem); } + // lt + template + inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f32x4_lt(self, other); + } + template ::value, void>::type> + inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_i8x16_lt(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_i16x8_lt(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_i32x4_lt(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return wasm_i64x2_lt(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_u8x16_lt(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_u16x8_lt(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_u32x4_lt(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return lt(self, other, generic {}); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + } + + template + inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f64x2_lt(self, other); + } + + // mask + template ::value, void>::type> + inline uint64_t mask(batch_bool const& self, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_i8x16_bitmask(self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_i16x8_bitmask(self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_i32x4_bitmask(self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return wasm_i64x2_bitmask(self); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + inline uint64_t mask(batch_bool const& self, requires_arch) noexcept + { + return wasm_i32x4_bitmask(self); + } + + template + inline uint64_t mask(batch_bool const& self, requires_arch) noexcept + { + return wasm_i64x2_bitmask(self); + } + // max template inline batch max(batch const& self, batch const& other, requires_arch) noexcept @@ -371,17 +770,117 @@ namespace xsimd return wasm_f64x2_mul(self, other); } + // neg + template ::value, void>::type> + inline batch neg(batch const& self, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_i8x16_neg(self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_i16x8_neg(self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_i32x4_neg(self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return wasm_i64x2_neg(self); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + + template + inline batch neg(batch const& self, requires_arch) noexcept + { + return wasm_f32x4_neg(self); + } + + template + inline batch neg(batch const& self, requires_arch) noexcept + { + return wasm_f64x2_neg(self); + } + + // neq + template + inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + 
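+            // f32x4.ne is an IEEE "unordered or not equal" comparison, so a NaN lane
+            // compares not-equal to itself, which is exactly what the isnan emulation
+            // above relies on.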
return wasm_f32x4_ne(self, other); + } + template ::value, void>::type> + inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return ~(self == other); + } + template + inline batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return wasm_f32x4_ne(self, other); + } + template ::value, void>::type> + inline batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return ~(self == other); + } + + template + inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_f64x2_ne(self, other); + } + template + inline batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return wasm_f64x2_ne(self, other); + } + + // reciprocal + template + inline batch reciprocal(batch const& self, requires_arch) noexcept + { + v128_t one = wasm_f32x4_splat(1.0f); + return wasm_f32x4_div(one, self); + } + template + inline batch reciprocal(batch const& self, requires_arch) noexcept + { + v128_t one = wasm_f64x2_splat(1.0); + return wasm_f64x2_div(one, self); + } + + // rsqrt + template + inline batch rsqrt(batch const& self, requires_arch) noexcept + { + v128_t one = wasm_f32x4_splat(1.0f); + return wasm_f32x4_div(one, wasm_f32x4_sqrt(self)); + } + template + inline batch rsqrt(batch const& self, requires_arch) noexcept + { + v128_t one = wasm_f64x2_splat(1.0); + return wasm_f64x2_div(one, wasm_f64x2_sqrt(self)); + } + // select template inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { - return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(cond, false_br)); + return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(false_br, cond)); } template ::value, void>::type> inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { - return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(cond, false_br)); + return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(false_br, cond)); } template ::value, void>::type> inline batch select(batch_bool_constant, Values...> const&, batch const& true_br, batch const& false_br, requires_arch) noexcept @@ -391,7 +890,7 @@ namespace xsimd template inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { - return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(cond, false_br)); + return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(false_br, cond)); } // set @@ -439,6 +938,20 @@ namespace xsimd return set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data; } + template + inline batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); + return set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data; + } + + template + inline batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); + return set(batch(), A {}, static_cast(values ? 
-1LL : 0LL)...).data; + } + // store_aligned template inline void store_aligned(float* mem, batch const& self, requires_arch) noexcept @@ -535,6 +1048,18 @@ namespace xsimd { return wasm_f64x2_sqrt(val); } + + // trunc + template + inline batch trunc(batch const& self, requires_arch) noexcept + { + return wasm_f32x4_trunc(self); + } + template + inline batch trunc(batch const& self, requires_arch) noexcept + { + return wasm_f64x2_trunc(self); + } } } From 1402e0e0947a2b60f7a926c428c35ed4abf29f43 Mon Sep 17 00:00:00 2001 From: anutosh491 Date: Wed, 18 Oct 2023 16:03:05 +0530 Subject: [PATCH 03/10] Added the following operation through emulations 1) arithmetic: sadd, ssub, hadd, haddp 2) batch manipulation: zip_lo, zip_hi, slide_left, slide_right 3) math: reduce_add 4) memory: store_complex, load_complex 5) misc: from_mask --- .../arch/generic/xsimd_generic_arithmetic.hpp | 44 ++ include/xsimd/arch/xsimd_generic_fwd.hpp | 6 + include/xsimd/arch/xsimd_wasm.hpp | 400 +++++++++++++++++- test/test_wasm/test_wasm.cpp | 31 -- 4 files changed, 444 insertions(+), 37 deletions(-) delete mode 100644 test/test_wasm/test_wasm.cpp diff --git a/include/xsimd/arch/generic/xsimd_generic_arithmetic.hpp b/include/xsimd/arch/generic/xsimd_generic_arithmetic.hpp index a14a5a2a3..c72e416c6 100644 --- a/include/xsimd/arch/generic/xsimd_generic_arithmetic.hpp +++ b/include/xsimd/arch/generic/xsimd_generic_arithmetic.hpp @@ -127,6 +127,20 @@ namespace xsimd return { res_r, res_i }; } + // hadd + template ::value, void>::type*/> + inline T hadd(batch const& self, requires_arch) noexcept + { + alignas(A::alignment()) T buffer[batch::size]; + self.store_aligned(buffer); + T res = 0; + for (T val : buffer) + { + res += val; + } + return res; + } + // incr template inline batch incr(batch const& self, requires_arch) noexcept @@ -172,6 +186,23 @@ namespace xsimd { return add(self, other); // no saturated arithmetic on floating point numbers } + template ::value, void>::type*/> + inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + auto mask = (other >> (8 * sizeof(T) - 1)); + auto self_pos_branch = min(std::numeric_limits::max() - other, self); + auto self_neg_branch = max(std::numeric_limits::min() - other, self); + return other + select(batch_bool(mask.data), self_neg_branch, self_pos_branch); + } + else + { + const auto diffmax = std::numeric_limits::max() - self; + const auto mindiff = min(diffmax, other); + return self + mindiff; + } + } template inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept { @@ -184,6 +215,19 @@ namespace xsimd { return sub(self, other); // no saturated arithmetic on floating point numbers } + template ::value, void>::type*/> + inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + return sadd(self, -other); + } + else + { + const auto diff = min(self, other); + return self - diff; + } + } template inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept { diff --git a/include/xsimd/arch/xsimd_generic_fwd.hpp b/include/xsimd/arch/xsimd_generic_fwd.hpp index 86e398a5e..87dcaa886 100644 --- a/include/xsimd/arch/xsimd_generic_fwd.hpp +++ b/include/xsimd/arch/xsimd_generic_fwd.hpp @@ -31,6 +31,12 @@ namespace xsimd inline batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept; template ::value, void>::type> inline batch mul(batch const& self, batch const& other, requires_arch) noexcept; + 
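+    // Forward declarations of the generic saturated-arithmetic and horizontal-add
+    // helpers, so that architecture-specific kernels included ahead of
+    // xsimd_generic.hpp can dispatch to them with a generic{} tag.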
template ::value, void>::type> + inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept; + template ::value, void>::type> + inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept; + template ::value, void>::type> + inline T hadd(batch const& self, requires_arch) noexcept; } } diff --git a/include/xsimd/arch/xsimd_wasm.hpp b/include/xsimd/arch/xsimd_wasm.hpp index a6f45ee10..f161120c6 100644 --- a/include/xsimd/arch/xsimd_wasm.hpp +++ b/include/xsimd/arch/xsimd_wasm.hpp @@ -181,7 +181,7 @@ namespace xsimd { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { - return wasm_v128_and(wasm_i8x16_splat(0xFF << other), wasm_i32x4_shl(self, other)); + return wasm_i8x16_shl(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { @@ -234,7 +234,7 @@ namespace xsimd { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { - return wasm_v128_and(wasm_i8x16_splat(0xFF >> other), wasm_u32x4_shr(self, other)); + return wasm_u8x16_shr(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { @@ -428,6 +428,102 @@ namespace xsimd return wasm_f64x2_floor(self); } + // from_mask + template + inline batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept + { + alignas(A::alignment()) static const uint32_t lut[][4] = { + { 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, + { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 }, + { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 }, + { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 }, + { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 }, + { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }, + { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF }, + { 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }, + { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, + }; + assert(!(mask & ~0xFul) && "inbound mask"); + return wasm_v128_load((const v128_t*)lut[mask]); + } + template + inline batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept + { + alignas(A::alignment()) static const uint64_t lut[][4] = { + { 0x0000000000000000ul, 0x0000000000000000ul }, + { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul }, + { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul }, + { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul }, + }; + assert(!(mask & ~0x3ul) && "inbound mask"); + return wasm_v128_load((const v128_t*)lut[mask]); + } + template ::value, void>::type> + inline batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept + { + alignas(A::alignment()) static const uint64_t lut64[] = { + 0x0000000000000000, + 0x000000000000FFFF, + 0x00000000FFFF0000, + 0x00000000FFFFFFFF, + 0x0000FFFF00000000, + 0x0000FFFF0000FFFF, + 0x0000FFFFFFFF0000, + 0x0000FFFFFFFFFFFF, + 0xFFFF000000000000, + 0xFFFF00000000FFFF, + 0xFFFF0000FFFF0000, + 0xFFFF0000FFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFFFFFF0000FFFF, + 0xFFFFFFFFFFFF0000, + 0xFFFFFFFFFFFFFFFF, + }; + alignas(A::alignment()) static const uint32_t lut32[] = { + 0x00000000, + 0x000000FF, + 0x0000FF00, + 0x0000FFFF, + 0x00FF0000, + 0x00FF00FF, + 0x00FFFF00, + 0x00FFFFFF, + 0xFF000000, + 0xFF0000FF, + 0xFF00FF00, + 0xFF00FFFF, + 0xFFFF0000, + 0xFFFF00FF, + 0xFFFFFF00, + 0xFFFFFFFF, + }; + 
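+            // Each 4-bit slice of the mask indexes a table entry that expands it into four
+            // all-ones or all-zero lanes; the slices are then packed back into one v128.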
XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + assert(!(mask & ~0xFFFF) && "inbound mask"); + return wasm_i32x4_make(lut32[mask & 0xF], lut32[(mask >> 4) & 0xF], lut32[(mask >> 8) & 0xF], lut32[mask >> 12]); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + assert(!(mask & ~0xFF) && "inbound mask"); + return wasm_i64x2_make(lut64[mask >> 4], lut64[mask & 0xF]); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return from_mask(batch_bool {}, mask, wasm {}); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return from_mask(batch_bool {}, mask, wasm {}); + } + } + // ge template inline batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept @@ -467,6 +563,11 @@ namespace xsimd { return wasm_i64x2_gt(self, other); } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } } else { @@ -495,6 +596,27 @@ namespace xsimd return wasm_f64x2_gt(self, other); } + // haddp + template + inline batch haddp(batch const* row, requires_arch) noexcept + { + v128_t tmp0 = wasm_i32x4_shuffle(row[0], row[1], 0, 4, 1, 5); + v128_t tmp1 = wasm_i32x4_shuffle(row[0], row[1], 2, 6, 3, 7); + v128_t tmp2 = wasm_i32x4_shuffle(row[2], row[3], 2, 6, 3, 7); + tmp0 = wasm_f32x4_add(tmp0, tmp1); + tmp1 = wasm_i32x4_shuffle(row[2], row[3], 0, 4, 1, 5); + tmp1 = wasm_f32x4_add(tmp1, tmp2); + tmp2 = wasm_i32x4_shuffle(tmp1, tmp0, 6, 7, 2, 3); + tmp0 = wasm_i32x4_shuffle(tmp0, tmp1, 0, 1, 4, 5); + return wasm_f32x4_add(tmp0, tmp2); + } + template + inline batch haddp(batch const* row, requires_arch) noexcept + { + return wasm_f64x2_add(wasm_i64x2_shuffle(row[0], row[1], 0, 2), + wasm_i64x2_shuffle(row[0], row[1], 1, 3)); + } + // insert template inline batch insert(batch const& self, float val, index pos, requires_arch) noexcept @@ -588,22 +710,34 @@ namespace xsimd template inline batch load_aligned(float const* mem, convert, requires_arch) noexcept { - // Assuming that mem is aligned properly, you can use wasm_v128_load to load the mem. return wasm_v128_load(mem); } template ::value, void>::type> inline batch load_aligned(T const* mem, convert, requires_arch) noexcept { - // Assuming that mem is aligned properly, you can use wasm_v128_load to load the mem. return wasm_v128_load((v128_t const*)mem); } template inline batch load_aligned(double const* mem, convert, requires_arch) noexcept { - // Assuming that mem is aligned properly, you can use wasm_v128_load to load the mem. 
return wasm_v128_load(mem); } + // load_complex + namespace detail + { + template + inline batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + { + return { wasm_i32x4_shuffle(hi, lo, 0, 2, 4, 6), wasm_i32x4_shuffle(hi, lo, 1, 3, 5, 7) }; + } + template + inline batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + { + return { wasm_i64x2_shuffle(hi, lo, 0, 2), wasm_i64x2_shuffle(hi, lo, 1, 3) }; + } + } + // load_unaligned template inline batch load_unaligned(float const* mem, convert, requires_arch) noexcept @@ -670,7 +804,15 @@ namespace xsimd } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { - return lt(self, other, generic {}); + auto xself = wasm_v128_xor(self, wasm_i64x2_splat(std::numeric_limits::lowest())); + auto xother = wasm_v128_xor(other, wasm_i64x2_splat(std::numeric_limits::lowest())); + v128_t tmp1 = wasm_i64x2_sub(xself, xother); + v128_t tmp2 = wasm_v128_xor(xself, xother); + v128_t tmp3 = wasm_v128_andnot(xself, xother); + v128_t tmp4 = wasm_v128_andnot(tmp1, tmp2); + v128_t tmp5 = wasm_v128_or(tmp3, tmp4); + v128_t tmp6 = wasm_i32x4_shr(tmp5, 31); + return wasm_i32x4_shuffle(tmp6, wasm_i32x4_splat(0), 1, 1, 3, 3); } else { @@ -856,6 +998,47 @@ namespace xsimd return wasm_f64x2_div(one, self); } + // reduce_add + template + inline float reduce_add(batch const& self, requires_arch) noexcept + { + v128_t tmp0 = wasm_f32x4_add(self, wasm_i32x4_shuffle(self, self, 6, 7, 2, 3)); + v128_t tmp1 = wasm_i32x4_shuffle(tmp0, tmp0, 1, 0, 4, 4); + v128_t tmp2 = wasm_f32x4_add(tmp0, tmp1); + v128_t tmp3 = wasm_i32x4_shuffle(tmp0, tmp2, 4, 1, 2, 3); + return wasm_f32x4_extract_lane(tmp3, 0); + } + template ::value, void>::type> + inline T reduce_add(batch const& self, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + v128_t tmp0 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2, 3, 0, 0); + v128_t tmp1 = wasm_i32x4_add(self, tmp0); + v128_t tmp2 = wasm_i32x4_shuffle(tmp1, wasm_i32x4_splat(0), 1, 0, 0, 0); + v128_t tmp3 = wasm_i32x4_add(tmp1, tmp2); + return wasm_i32x4_extract_lane(tmp3, 0); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + v128_t tmp0 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2, 3, 0, 0); + v128_t tmp1 = wasm_i64x2_add(self, tmp0); + return wasm_i64x2_extract_lane(tmp1, 0); + } + else + { + return hadd(self, generic {}); + } + } + template + inline double reduce_add(batch const& self, requires_arch) noexcept + { + v128_t tmp0 = wasm_i64x2_shuffle(self, self, 1, 3); + v128_t tmp1 = wasm_f64x2_add(self, tmp0); + v128_t tmp2 = wasm_i64x2_shuffle(tmp0, tmp1, 2, 1); + return wasm_f64x2_extract_lane(tmp2, 0); + } + // rsqrt template inline batch rsqrt(batch const& self, requires_arch) noexcept @@ -870,6 +1053,74 @@ namespace xsimd return wasm_f64x2_div(one, wasm_f64x2_sqrt(self)); } + // slide_left + template + inline batch slide_left(batch const& x, requires_arch) noexcept + { + return wasm_i8x16_shuffle( + wasm_i64x2_const(0, 0), x, ((N)&0xF0) ? 0 : 16 - ((N)&0xF), + ((N)&0xF0) ? 0 : 17 - ((N)&0xF), ((N)&0xF0) ? 0 : 18 - ((N)&0xF), + ((N)&0xF0) ? 0 : 19 - ((N)&0xF), ((N)&0xF0) ? 0 : 20 - ((N)&0xF), + ((N)&0xF0) ? 0 : 21 - ((N)&0xF), ((N)&0xF0) ? 0 : 22 - ((N)&0xF), + ((N)&0xF0) ? 0 : 23 - ((N)&0xF), ((N)&0xF0) ? 0 : 24 - ((N)&0xF), + ((N)&0xF0) ? 0 : 25 - ((N)&0xF), ((N)&0xF0) ? 0 : 26 - ((N)&0xF), + ((N)&0xF0) ? 0 : 27 - ((N)&0xF), ((N)&0xF0) ? 0 : 28 - ((N)&0xF), + ((N)&0xF0) ? 0 : 29 - ((N)&0xF), ((N)&0xF0) ? 0 : 30 - ((N)&0xF), + ((N)&0xF0) ? 
0 : 31 - ((N)&0xF)); + } + + // slide_right + template + inline batch slide_right(batch const& x, requires_arch) noexcept + { + return wasm_i8x16_shuffle( + x, wasm_i64x2_const(0, 0), ((N)&0xF0) ? 16 : ((N)&0xF) + 0, + ((N)&0xF0) ? 16 : ((N)&0xF) + 1, ((N)&0xF0) ? 16 : ((N)&0xF) + 2, + ((N)&0xF0) ? 16 : ((N)&0xF) + 3, ((N)&0xF0) ? 16 : ((N)&0xF) + 4, + ((N)&0xF0) ? 16 : ((N)&0xF) + 5, ((N)&0xF0) ? 16 : ((N)&0xF) + 6, + ((N)&0xF0) ? 16 : ((N)&0xF) + 7, ((N)&0xF0) ? 16 : ((N)&0xF) + 8, + ((N)&0xF0) ? 16 : ((N)&0xF) + 9, ((N)&0xF0) ? 16 : ((N)&0xF) + 10, + ((N)&0xF0) ? 16 : ((N)&0xF) + 11, ((N)&0xF0) ? 16 : ((N)&0xF) + 12, + ((N)&0xF0) ? 16 : ((N)&0xF) + 13, ((N)&0xF0) ? 16 : ((N)&0xF) + 14, + ((N)&0xF0) ? 16 : ((N)&0xF) + 15); + } + + // sadd + template ::value, void>::type> + inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_i8x16_add_sat(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_i16x8_add_sat(self, other); + } + else + { + return sadd(self, other, generic {}); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_u8x16_add_sat(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_u16x8_add_sat(self, other); + } + else + { + return sadd(self, other, generic {}); + } + } + } + // select template inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept @@ -952,6 +1203,42 @@ namespace xsimd return set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data; } + // ssub + template ::value, void>::type> + inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_i8x16_sub_sat(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_i16x8_sub_sat(self, other); + } + else + { + return ssub(self, other, generic {}); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_u8x16_sub_sat(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_u16x8_sub_sat(self, other); + } + else + { + return ssub(self, other, generic {}); + } + } + } + // store_aligned template inline void store_aligned(float* mem, batch const& self, requires_arch) noexcept @@ -978,6 +1265,33 @@ namespace xsimd return wasm_v128_store(mem, self); } + // store_complex + namespace detail + { + // complex_low + template + inline batch complex_low(batch, A> const& self, requires_arch) noexcept + { + return wasm_i32x4_shuffle(self.real(), self.imag(), 0, 4, 1, 5); + } + // complex_high + template + inline batch complex_high(batch, A> const& self, requires_arch) noexcept + { + return wasm_i32x4_shuffle(self.real(), self.imag(), 2, 6, 3, 7); + } + template + inline batch complex_low(batch, A> const& self, requires_arch) noexcept + { + return wasm_i64x2_shuffle(self.real(), self.imag(), 0, 2); + } + template + inline batch complex_high(batch, A> const& self, requires_arch) noexcept + { + return wasm_i64x2_shuffle(self.real(), self.imag(), 1, 3); + } + } + // store_unaligned template inline void store_unaligned(float* mem, batch const& self, requires_arch) noexcept @@ -1060,6 +1374,80 @@ namespace xsimd { return wasm_f64x2_trunc(self); } + + // zip_hi + template + inline batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_i32x4_shuffle(self, other, 2, 6, 3, 7); + } + 
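+        // zip_hi interleaves the upper halves of its two operands: for {a0,a1,a2,a3} and
+        // {b0,b1,b2,b3} the shuffle indices 2, 6, 3, 7 select a2, b2, a3, b3 (indices 0-3
+        // address `self`, 4-7 address `other`). The integer and double overloads below
+        // follow the same pattern at their own lane widths.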
template ::value, void>::type> + inline batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_i8x16_shuffle(self, other, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_i16x8_shuffle(self, other, 4, 12, 5, 13, 6, 14, 7, 15); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_i32x4_shuffle(self, other, 2, 6, 3, 7); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return wasm_i64x2_shuffle(self, other, 1, 3); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + inline batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_i64x2_shuffle(self, other, 1, 3); + } + + // zip_lo + template + inline batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_i32x4_shuffle(self, other, 0, 4, 1, 5); + } + template ::value, void>::type> + inline batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_i8x16_shuffle(self, other, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_i16x8_shuffle(self, other, 0, 8, 1, 9, 2, 10, 3, 11); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return wasm_i32x4_shuffle(self, other, 0, 4, 1, 5); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return wasm_i64x2_shuffle(self, other, 0, 2); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + inline batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + return wasm_i64x2_shuffle(self, other, 0, 2); + } } } diff --git a/test/test_wasm/test_wasm.cpp b/test/test_wasm/test_wasm.cpp deleted file mode 100644 index 65ee6f89c..000000000 --- a/test/test_wasm/test_wasm.cpp +++ /dev/null @@ -1,31 +0,0 @@ -#include "xsimd/xsimd.hpp" -#include -#include - -#include -#include // for reporting errors - -using namespace emscripten; - -int test_abs() -{ - std::cout << "test_abs" << std::endl; - auto ans = xsimd::abs(a); - std::cout << ans << std::endl; - return 0; -} - -int run_tests() -{ - // todo add actual tests - if (auto ret = test_(); ret != 0) - { - return ret; - } - return 0; -} - -EMSCRIPTEN_BINDINGS(my_module) -{ - emscripten::function("run_tests", &run_tests); -} \ No newline at end of file From 2898acfbbe4df0184459f41341726914dc81909f Mon Sep 17 00:00:00 2001 From: anutosh491 Date: Wed, 25 Oct 2023 14:40:01 +0530 Subject: [PATCH 04/10] Implemented sadd, ssub & reduce_add through the generic implementation for xsimd_sse2 --- include/xsimd/arch/xsimd_sse2.hpp | 66 +++---------------------------- 1 file changed, 6 insertions(+), 60 deletions(-) diff --git a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp index 17633fd68..ec173f7c9 100644 --- a/include/xsimd/arch/xsimd_sse2.hpp +++ b/include/xsimd/arch/xsimd_sse2.hpp @@ -1237,22 +1237,7 @@ namespace xsimd batch acc3 = min(acc2, step3); return acc3.get(0); } - // TODO: move this in xsimd_generic - namespace detail - { - template ::value, void>::type> - inline T hadd_default(batch const& self, requires_arch) noexcept - { - alignas(A::alignment()) T buffer[batch::size]; - self.store_aligned(buffer); - T res = 0; - for (T val : buffer) - { - res += val; - } - return res; - } - } + template ::value, void>::type> inline T reduce_add(batch 
const& self, requires_arch) noexcept { @@ -1280,7 +1265,7 @@ namespace xsimd } else { - return detail::hadd_default(self, A {}); + return hadd(self, generic {}); } } template @@ -1381,28 +1366,6 @@ namespace xsimd // sadd - // TODO: move this in xsimd_generic - namespace detail - { - template ::value, void>::type> - inline batch sadd_default(batch const& self, batch const& other, requires_arch) noexcept - { - if (std::is_signed::value) - { - auto mask = (other >> (8 * sizeof(T) - 1)); - auto self_pos_branch = min(std::numeric_limits::max() - other, self); - auto self_neg_branch = max(std::numeric_limits::min() - other, self); - return other + select(batch_bool(mask.data), self_neg_branch, self_pos_branch); - } - else - { - const auto diffmax = std::numeric_limits::max() - self; - const auto mindiff = min(diffmax, other); - return self + mindiff; - } - } - } - template ::value, void>::type> inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept { @@ -1418,7 +1381,7 @@ namespace xsimd } else { - return detail::sadd_default(self, other, A {}); + return sadd(self, other, generic {}); } } else @@ -1433,7 +1396,7 @@ namespace xsimd } else { - return detail::sadd_default(self, other, A {}); + return sadd(self, other, generic {}); } } } @@ -1495,23 +1458,6 @@ namespace xsimd } // ssub - // TODO: move this in xsimd_generic - namespace detail - { - template ::value, void>::type> - inline batch ssub_default(batch const& self, batch const& other, requires_arch) noexcept - { - if (std::is_signed::value) - { - return sadd(self, -other); - } - else - { - const auto diff = min(self, other); - return self - diff; - } - } - } template ::value, void>::type> inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept @@ -1528,7 +1474,7 @@ namespace xsimd } else { - return detail::ssub_default(self, other, A {}); + return ssub(self, other, generic {}); } } else @@ -1543,7 +1489,7 @@ namespace xsimd } else { - return detail::ssub_default(self, other, A {}); + return ssub(self, other, generic {}); } } } From 029aa9bfd70497128db41867b779bc05fdec473d Mon Sep 17 00:00:00 2001 From: Johan Mabille Date: Sun, 29 Oct 2023 06:13:16 +0100 Subject: [PATCH 05/10] Fixed RTD build --- docs/environment.yml | 2 +- readthedocs.yml | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/docs/environment.yml b/docs/environment.yml index 63b08b681..2dee10879 100644 --- a/docs/environment.yml +++ b/docs/environment.yml @@ -5,4 +5,4 @@ channels: dependencies: - breathe - #- docutils<0.17 + - sphinx_rtd_theme diff --git a/readthedocs.yml b/readthedocs.yml index 02f0e7baa..38f414be8 100644 --- a/readthedocs.yml +++ b/readthedocs.yml @@ -1,2 +1,9 @@ +version: 2 + +build: + os: "ubuntu-22.04" + tools: + python: "mambaforge-22.9" + conda: - file: docs/environment.yml + environment: docs/environment.yml From f9dcafb5b502030e801d05af9beba018bdda8068 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 29 Oct 2023 22:48:07 +0100 Subject: [PATCH 06/10] Provide a generic version for float to uint32_t conversion, only if the int32 conversion is available This remove duplicate code for various Intel implementation and should also help for #962. 
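
The generic helper added here splits the unsigned 32-bit input into two 16-bit halves
that are exactly representable in float, converts each half with the signed
int32 -> float cast the target is assumed to provide, and recombines them as
hi * 65536.0f + lo, so at most the final addition can round. A rough scalar sketch of
the same decomposition (the function name is made up for illustration):

    #include <cstdint>

    inline float u32_to_float_sketch(uint32_t v)
    {
        uint32_t lo = v & 0xFFFFu; // low 16 bits, exact in float
        uint32_t hi = v >> 16;     // high 16 bits, exact in float
        float lo_f = static_cast<float>(static_cast<int32_t>(lo)); // no rounding
        float hi_f = static_cast<float>(static_cast<int32_t>(hi)); // no rounding
        return hi_f * 65536.0f + lo_f; // rounding, if any, happens only here
    }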
--- .../arch/generic/xsimd_generic_details.hpp | 17 +++++++++++++++++ include/xsimd/arch/xsimd_avx.hpp | 16 ---------------- include/xsimd/arch/xsimd_avx2.hpp | 15 --------------- include/xsimd/arch/xsimd_sse2.hpp | 15 --------------- 4 files changed, 17 insertions(+), 46 deletions(-) diff --git a/include/xsimd/arch/generic/xsimd_generic_details.hpp b/include/xsimd/arch/generic/xsimd_generic_details.hpp index 90d7643ee..3e8c764c6 100644 --- a/include/xsimd/arch/generic/xsimd_generic_details.hpp +++ b/include/xsimd/arch/generic/xsimd_generic_details.hpp @@ -180,6 +180,23 @@ namespace xsimd { return bitwise_cast(self); } + + // Provide a generic uint32_t -> float cast only if we have a + // non-generic int32_t -> float fast_cast + template const&>(), std::declval const&>(), A {}))> + inline batch fast_cast(batch const& v, batch const&, requires_arch) noexcept + { + // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse + batch msk_lo(0xFFFF); + batch cnst65536f(65536.0f); + + auto v_lo = batch_cast(v & msk_lo); /* extract the 16 lowest significant bits of self */ + auto v_hi = batch_cast(v >> 16); /* 16 most significant bits of v */ + auto v_lo_flt = batch_cast(v_lo); /* No rounding */ + auto v_hi_flt = batch_cast(v_hi); /* No rounding */ + v_hi_flt = cnst65536f * v_hi_flt; /* No rounding */ + return v_hi_flt + v_lo_flt; /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */ + } } namespace detail diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp index 76c297211..162e179ee 100644 --- a/include/xsimd/arch/xsimd_avx.hpp +++ b/include/xsimd/arch/xsimd_avx.hpp @@ -515,22 +515,6 @@ namespace xsimd return _mm256_cvtepi32_ps(self); } - template - inline batch fast_cast(batch const& v, batch const&, requires_arch) noexcept - { - // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse - // adapted to avx - __m256i msk_lo = _mm256_set1_epi32(0xFFFF); - __m256 cnst65536f = _mm256_set1_ps(65536.0f); - - __m256i v_lo = bitwise_and(batch(v), batch(msk_lo)); /* extract the 16 lowest significant bits of self */ - __m256i v_hi = bitwise_rshift(batch(v), 16, avx {}); /* 16 most significant bits of v */ - __m256 v_lo_flt = _mm256_cvtepi32_ps(v_lo); /* No rounding */ - __m256 v_hi_flt = _mm256_cvtepi32_ps(v_hi); /* No rounding */ - v_hi_flt = _mm256_mul_ps(cnst65536f, v_hi_flt); /* No rounding */ - return _mm256_add_ps(v_hi_flt, v_lo_flt); /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */ - } - template inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index 7403afe33..a5b07ec9d 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -279,21 +279,6 @@ namespace xsimd namespace detail { - template - inline batch fast_cast(batch const& v, batch const&, requires_arch) noexcept - { - // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse - __m256i msk_lo = _mm256_set1_epi32(0xFFFF); - __m256 cnst65536f = _mm256_set1_ps(65536.0f); - - __m256i v_lo = _mm256_and_si256(v, msk_lo); /* extract the 16 lowest significant bits of self */ - __m256i v_hi = _mm256_srli_epi32(v, 16); /* 16 most significant bits of v */ - __m256 v_lo_flt = _mm256_cvtepi32_ps(v_lo); /* No rounding */ - __m256 v_hi_flt = _mm256_cvtepi32_ps(v_hi); /* No rounding */ - v_hi_flt = 
_mm256_mul_ps(cnst65536f, v_hi_flt); /* No rounding */ - return _mm256_add_ps(v_hi_flt, v_lo_flt); /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */ - } - template inline batch fast_cast(batch const& x, batch const&, requires_arch) noexcept { diff --git a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp index ec173f7c9..a2e035dc5 100644 --- a/include/xsimd/arch/xsimd_sse2.hpp +++ b/include/xsimd/arch/xsimd_sse2.hpp @@ -541,21 +541,6 @@ namespace xsimd return _mm_cvtepi32_ps(self); } - template - inline batch fast_cast(batch const& v, batch const&, requires_arch) noexcept - { - // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse - __m128i msk_lo = _mm_set1_epi32(0xFFFF); - __m128 cnst65536f = _mm_set1_ps(65536.0f); - - __m128i v_lo = _mm_and_si128(v, msk_lo); /* extract the 16 lowest significant bits of self */ - __m128i v_hi = _mm_srli_epi32(v, 16); /* 16 most significant bits of v */ - __m128 v_lo_flt = _mm_cvtepi32_ps(v_lo); /* No rounding */ - __m128 v_hi_flt = _mm_cvtepi32_ps(v_hi); /* No rounding */ - v_hi_flt = _mm_mul_ps(cnst65536f, v_hi_flt); /* No rounding */ - return _mm_add_ps(v_hi_flt, v_lo_flt); /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */ - } - template inline batch fast_cast(batch const& x, batch const&, requires_arch) noexcept { From 0ba53ef543b3c41e0e5145abe7d8b3b4861a7748 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Tue, 31 Oct 2023 23:46:21 +0100 Subject: [PATCH 07/10] Provide a generic version for uint32_t to float conversion, only if the int32 conversion is available This remove duplicate code for various Intel implementation and should also help for #962. --- .../xsimd/arch/generic/xsimd_generic_details.hpp | 13 +++++++++++++ include/xsimd/arch/xsimd_avx.hpp | 11 ----------- include/xsimd/arch/xsimd_sse2.hpp | 12 ------------ include/xsimd/arch/xsimd_sse4_1.hpp | 11 ----------- 4 files changed, 13 insertions(+), 34 deletions(-) diff --git a/include/xsimd/arch/generic/xsimd_generic_details.hpp b/include/xsimd/arch/generic/xsimd_generic_details.hpp index 3e8c764c6..14c62a089 100644 --- a/include/xsimd/arch/generic/xsimd_generic_details.hpp +++ b/include/xsimd/arch/generic/xsimd_generic_details.hpp @@ -197,6 +197,19 @@ namespace xsimd v_hi_flt = cnst65536f * v_hi_flt; /* No rounding */ return v_hi_flt + v_lo_flt; /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */ } + + // Provide a generic float -> uint32_t cast only if we have a + // non-generic float -> int32_t fast_cast + template const&>(), std::declval const&>(), A {}))> + inline batch fast_cast(batch const& v, batch const&, requires_arch) noexcept + { + auto is_large = v >= batch(1u << 31); + auto small = bitwise_cast(batch_cast(v)); + auto large = bitwise_cast( + batch_cast(v - batch(1u << 31)) + ^ batch(1u << 31)); + return bitwise_cast(select(is_large, large, small)); + } } namespace detail diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp index 162e179ee..5ec1e02d4 100644 --- a/include/xsimd/arch/xsimd_avx.hpp +++ b/include/xsimd/arch/xsimd_avx.hpp @@ -520,17 +520,6 @@ namespace xsimd { return _mm256_cvttps_epi32(self); } - - template - inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept - { - return _mm256_castps_si256( - _mm256_blendv_ps(_mm256_castsi256_ps(_mm256_cvttps_epi32(self)), - _mm256_xor_ps( - _mm256_castsi256_ps(_mm256_cvttps_epi32(_mm256_sub_ps(self, 
_mm256_set1_ps(1u << 31)))), - _mm256_castsi256_ps(_mm256_set1_epi32(1u << 31))), - _mm256_cmp_ps(self, _mm256_set1_ps(1u << 31), _CMP_GE_OQ))); - } } // decr_if diff --git a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp index a2e035dc5..1639ba2bf 100644 --- a/include/xsimd/arch/xsimd_sse2.hpp +++ b/include/xsimd/arch/xsimd_sse2.hpp @@ -573,18 +573,6 @@ namespace xsimd { return _mm_cvttps_epi32(self); } - - template - inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept - { - __m128 mask = _mm_cmpge_ps(self, _mm_set1_ps(1u << 31)); - __m128 lhs = _mm_castsi128_ps(_mm_cvttps_epi32(self)); - __m128 rhs = _mm_castsi128_ps(_mm_xor_si128( - _mm_cvttps_epi32(_mm_sub_ps(self, _mm_set1_ps(1u << 31))), - _mm_set1_epi32(1u << 31))); - return _mm_castps_si128(_mm_or_ps(_mm_and_ps(mask, rhs), _mm_andnot_ps(mask, lhs))); - } - } // eq diff --git a/include/xsimd/arch/xsimd_sse4_1.hpp b/include/xsimd/arch/xsimd_sse4_1.hpp index c0e2878ef..165a191e4 100644 --- a/include/xsimd/arch/xsimd_sse4_1.hpp +++ b/include/xsimd/arch/xsimd_sse4_1.hpp @@ -65,17 +65,6 @@ namespace xsimd __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52 return _mm_add_pd(f, _mm_castsi128_pd(xL)); } - - template - inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept - { - return _mm_castps_si128( - _mm_blendv_ps(_mm_castsi128_ps(_mm_cvttps_epi32(self)), - _mm_castsi128_ps(_mm_xor_si128( - _mm_cvttps_epi32(_mm_sub_ps(self, _mm_set1_ps(1u << 31))), - _mm_set1_epi32(1u << 31))), - _mm_cmpge_ps(self, _mm_set1_ps(1u << 31)))); - } } // eq From 2eaa6ee9ff56f8ae13e4f6b7cb9d40ee9f84b6b6 Mon Sep 17 00:00:00 2001 From: anutosh491 Date: Wed, 25 Oct 2023 18:16:59 +0530 Subject: [PATCH 08/10] Implemented the following operations for the wasm instruction set: reduce_max, reduce_min, swizzle, shuffle, bitwise_cast, fast_cast --- include/xsimd/arch/xsimd_wasm.hpp | 186 +++++++++++++++++++++++++++++- 1 file changed, 180 insertions(+), 6 deletions(-) diff --git a/include/xsimd/arch/xsimd_wasm.hpp b/include/xsimd/arch/xsimd_wasm.hpp index f161120c6..32a5d67c8 100644 --- a/include/xsimd/arch/xsimd_wasm.hpp +++ b/include/xsimd/arch/xsimd_wasm.hpp @@ -19,11 +19,25 @@ namespace xsimd { + template + struct batch_bool_constant; + + template + inline batch bitwise_cast(batch const& x) noexcept; + + template + struct batch_constant; namespace kernel { using namespace types; + // fwd + template + inline batch insert(batch const& self, T val, index, requires_arch) noexcept; + template + inline batch shuffle(batch const& x, batch const& y, batch_constant, Indices...>, requires_arch) noexcept; + // abs template ::value && std::is_signed::value, void>::type> inline batch abs(batch const& self, requires_arch) noexcept @@ -136,6 +150,13 @@ namespace xsimd return wasm_i8x16_bitmask(self) != 0; } + // batch_bool_cast + template + inline batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch) noexcept + { + return { bitwise_cast(batch(self.data)).data }; + } + // bitwise_and template inline batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept @@ -162,6 +183,13 @@ namespace xsimd return wasm_v128_andnot(self, other); } + // bitwise_cast + template + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return batch(self.data); + } + // bitwise_or template inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept @@ -415,6 
+443,53 @@ namespace xsimd return wasm_f64x2_eq(self, other); } + // fast_cast + namespace detail + { + template + inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return wasm_f32x4_convert_i32x4(self); + } + + template + inline batch fast_cast(batch const& x, batch const&, requires_arch) noexcept + { + // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx + // adapted to wasm + v128_t xH = wasm_u64x2_shr(x, 32); + xH = wasm_v128_or(xH, wasm_f64x2_splat(19342813113834066795298816.)); // 2^84 + v128_t mask = wasm_i16x8_make(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000); + v128_t xL = wasm_v128_or(wasm_v128_and(mask, x), wasm_v128_andnot(wasm_f64x2_splat(0x0010000000000000), mask)); // 2^52 + v128_t f = wasm_f64x2_sub(xH, wasm_f64x2_splat(19342813118337666422669312.)); // 2^84 + 2^52 + return wasm_f64x2_add(f, xL); + } + + template + inline batch fast_cast(batch const& x, batch const&, requires_arch) noexcept + { + // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx + // adapted to wasm + v128_t xH = wasm_i32x4_shr(x, 16); + xH = wasm_v128_and(xH, wasm_i16x8_make(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF)); + xH = wasm_i64x2_add(xH, wasm_f64x2_splat(442721857769029238784.)); // 3*2^67 + v128_t mask = wasm_i16x8_make(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000); + v128_t xL = wasm_v128_or(wasm_v128_and(mask, x), wasm_v128_andnot(wasm_f64x2_splat(0x0010000000000000), mask)); // 2^52 + v128_t f = wasm_f64x2_sub(xH, wasm_f64x2_splat(442726361368656609280.)); // 3*2^67 + 2^52 + return wasm_f64x2_add(f, xL); + } + + template + inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return wasm_i32x4_make( + static_cast(wasm_f32x4_extract_lane(self, 0)), + static_cast(wasm_f32x4_extract_lane(self, 1)), + static_cast(wasm_f32x4_extract_lane(self, 2)), + static_cast(wasm_f32x4_extract_lane(self, 3))); + } + } + // floor template inline batch floor(batch const& self, requires_arch) noexcept @@ -516,11 +591,11 @@ namespace xsimd } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { - return from_mask(batch_bool {}, mask, wasm {}); + return batch_bool_cast(from_mask(batch_bool {}, mask, wasm {})); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { - return from_mask(batch_bool {}, mask, wasm {}); + return batch_bool_cast(from_mask(batch_bool {}, mask, wasm {})); } } @@ -1039,6 +1114,44 @@ namespace xsimd return wasm_f64x2_extract_lane(tmp2, 0); } + // reduce_max + template ::type> + inline T reduce_max(batch const& self, requires_arch) noexcept + { + batch step0 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2, 3, 0, 0); + batch acc0 = max(self, step0); + + batch step1 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 1, 0, 0, 0); + batch acc1 = max(acc0, step1); + + batch step2 = wasm_i16x8_shuffle(acc1, wasm_i16x8_splat(0), 1, 0, 0, 0, 4, 5, 6, 7); + batch acc2 = max(acc1, step2); + if (sizeof(T) == 2) + return acc2.get(0); + batch step3 = bitwise_cast(bitwise_cast(acc2) >> 8); + batch acc3 = max(acc2, step3); + return acc3.get(0); + } + + // reduce_min + template ::type> + inline T reduce_min(batch const& self, requires_arch) noexcept + { + batch step0 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2, 3, 0, 0); + batch acc0 = min(self, step0); + + batch step1 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 1, 0, 0, 0); + batch acc1 = min(acc0, step1); + + batch step2 
= wasm_i16x8_shuffle(acc1, wasm_i16x8_splat(0), 1, 0, 0, 0, 4, 5, 6, 7); + batch acc2 = min(acc1, step2); + if (sizeof(T) == 2) + return acc2.get(0); + batch step3 = bitwise_cast(bitwise_cast(acc2) >> 8); + batch acc3 = min(acc2, step3); + return acc3.get(0); + } + // rsqrt template inline batch rsqrt(batch const& self, requires_arch) noexcept @@ -1144,6 +1257,33 @@ namespace xsimd return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(false_br, cond)); } + // shuffle + template + inline batch shuffle(batch const& x, batch const& y, batch_constant, I0, I1, I2, I3> mask, requires_arch) noexcept + { + // shuffle within lane + if (I0 < 4 && I1 < 4 && I2 >= 4 && I3 >= 4) + return wasm_i32x4_shuffle(x, y, I0, I1, I2, I3); + + // shuffle within opposite lane + if (I0 >= 4 && I1 >= 4 && I2 < 4 && I3 < 4) + return wasm_i32x4_shuffle(y, x, I0, I1, I2, I3); + return shuffle(x, y, mask, generic {}); + } + + template + inline batch shuffle(batch const& x, batch const& y, batch_constant, I0, I1> mask, requires_arch) noexcept + { + // shuffle within lane + if (I0 < 2 && I1 >= 2) + return wasm_i64x2_shuffle(x, y, I0, I1); + + // shuffle within opposite lane + if (I0 >= 2 && I1 < 2) + return wasm_i64x2_shuffle(y, x, I0, I1); + return shuffle(x, y, mask, generic {}); + } + // set template inline batch set(batch const&, requires_arch, Values... values) noexcept @@ -1243,25 +1383,21 @@ namespace xsimd template inline void store_aligned(float* mem, batch const& self, requires_arch) noexcept { - // Assuming that mem is aligned properly, you can use wasm_v128_store to store the batch. return wasm_v128_store(mem, self); } template ::value, void>::type> inline void store_aligned(T* mem, batch const& self, requires_arch) noexcept { - // Assuming that mem is aligned properly, you can use wasm_v128_store to store the batch. return wasm_v128_store((v128_t*)mem, self); } template ::value, void>::type> inline void store_aligned(T* mem, batch_bool const& self, requires_arch) noexcept { - // Assuming that mem is aligned properly, you can use wasm_v128_store to store the batch. return wasm_v128_store((v128_t*)mem, self); } template inline void store_aligned(double* mem, batch const& self, requires_arch) noexcept { - // Assuming that mem is aligned properly, you can use wasm_v128_store to store the batch. 
return wasm_v128_store(mem, self); } @@ -1363,6 +1499,44 @@ namespace xsimd return wasm_f64x2_sqrt(val); } + // swizzle + + template + inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3>, requires_arch) noexcept + { + return wasm_i32x4_shuffle(self, self, V0, V1, V2, V3); + } + + template + inline batch swizzle(batch const& self, batch_constant, V0, V1>, requires_arch) noexcept + { + return wasm_i64x2_shuffle(self, self, V0, V1); + } + + template + inline batch swizzle(batch const& self, batch_constant, V0, V1>, requires_arch) noexcept + { + return wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1); + } + + template + inline batch swizzle(batch const& self, batch_constant, V0, V1> mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, wasm {})); + } + + template + inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3>, requires_arch) noexcept + { + return wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), V0, V1, V2, V3); + } + + template + inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3> mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, wasm {})); + } + // trunc template inline batch trunc(batch const& self, requires_arch) noexcept From fc6c3fbd83831c6575c85208c875f1b24574cbfd Mon Sep 17 00:00:00 2001 From: Johan Mabille Date: Thu, 2 Nov 2023 09:28:11 +0100 Subject: [PATCH 09/10] Fixed jQuery not loaded in RTD page --- docs/source/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index e0af0a0d7..4471fc992 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -17,7 +17,7 @@ def setup(app): app.add_css_file("main_stylesheet.css") -extensions = ['breathe'] +extensions = ['breathe', 'sphinx_rtd_theme'] breathe_projects = { 'xsimd': '../xml' } templates_path = ['_templates'] html_static_path = ['_static'] From 54278ed4cae61bc782896ccbc722956fa7db027f Mon Sep 17 00:00:00 2001 From: Johan Mabille Date: Thu, 2 Nov 2023 11:05:09 +0100 Subject: [PATCH 10/10] Updated to last Intel SDE --- .github/workflows/linux.yml | 2 +- install_sde.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 7b1631de7..044a7cb18 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -117,7 +117,7 @@ jobs: cd _build cd test if [[ '${{ matrix.sys.flags }}' == 'avx512' || '${{ matrix.sys.flags }}' == 'avx512cd' ]]; then - ../../sde-external-8.56.0-2020-07-05-lin/sde64 -skx -- ./test_xsimd + ../../sde-external-8.69.1-2021-07-18-lin/sde64 -skx -- ./test_xsimd else ./test_xsimd fi diff --git a/install_sde.sh b/install_sde.sh index 9e866316c..934b675e8 100644 --- a/install_sde.sh +++ b/install_sde.sh @@ -4,7 +4,7 @@ #python ./intel-sde-downloader.py sde-external-8.35.0-2019-03-11-lin.tar.bz2 #wget http://software.intel.com/content/dam/develop/external/us/en/protected/sde-external-8.50.0-2020-03-26-lin.tar.bz2 -wget --user-agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36" https://www.intel.com/content/dam/develop/external/us/en/documents/sde-external-8.56.0-2020-07-05-lin.tar.bz2 +wget --user-agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36" https://github.com/xtensor-stack/xsimd-testing-resources/releases/download/1.0.0/sde-external-8.69.1-2021-07-18-lin.tar.bz2 
-tar xvf sde-external-8.56.0-2020-07-05-lin.tar.bz2
+tar xvf sde-external-8.69.1-2021-07-18-lin.tar.bz2
 sudo sh -c "echo 0 > /proc/sys/kernel/yama/ptrace_scope"
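
For reference, a minimal usage sketch of the operations the WASM patches above introduce. It assumes a build with Emscripten and -msimd128 so that XSIMD_WITH_WASM is enabled; the file name, sample values, and build line are illustrative and not part of this series:

// wasm_sketch.cpp -- illustrative only; build with something like:
//   em++ -O2 -msimd128 wasm_sketch.cpp -o wasm_sketch.js
#include "xsimd/xsimd.hpp"
#include <cstdint>
#include <iostream>

int main()
{
    using batch = xsimd::batch<int32_t, xsimd::wasm>;

    batch a(1, -5, 42, 7);
    batch b = xsimd::abs(a);            // lane-wise |a|
    int32_t hi = xsimd::reduce_max(b);  // horizontal maximum, 42 expected
    int32_t lo = xsimd::reduce_min(b);  // horizontal minimum, 1 expected

    // Compile-time lane reversal through the new swizzle overloads.
    using reverse_t = xsimd::batch_constant<xsimd::batch<uint32_t, xsimd::wasm>, 3, 2, 1, 0>;
    batch r = xsimd::swizzle(b, reverse_t {});

    std::cout << hi << " " << lo << " " << r.get(0) << std::endl;
    return 0;
}
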