serge-sans-paille · serge-sans-paille · Dec 21, 2023 · Dec 20, 2023
diff --git a/pythran/xsimd/arch/generic/xsimd_generic_arithmetic.hpp b/pythran/xsimd/arch/generic/xsimd_generic_arithmetic.hpp
@@ -13,6 +13,7 @@
 #define XSIMD_GENERIC_ARITHMETIC_HPP
 
 #include <complex>
+#include <limits>
 #include <type_traits>
 
 #include "./xsimd_generic_details.hpp"
@@ -126,6 +127,20 @@ namespace xsimd
             return { res_r, res_i };
         }
 
+        // hadd: generic horizontal add, folding every lane of the batch into one scalar in lane order
+        template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
+        inline T hadd(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            alignas(A::alignment()) T lanes[batch<T, A>::size]; // aligned scratch copy of the register
+            self.store_aligned(lanes);
+            T total = 0;
+            for (std::size_t lane = 0; lane < batch<T, A>::size; ++lane)
+            {
+                total += lanes[lane];
+            }
+            return total;
+        }
+
         // incr
         template <class A, class T>
         inline batch<T, A> incr(batch<T, A> const& self, requires_arch<generic>) noexcept
@@ -149,12 +164,45 @@ namespace xsimd
                                  self, other);
         }
 
+        // rotl: rotate every lane of self left by other bits, built from two shifts and a bitwise or
+        template <class A, class T, class STy>
+        inline batch<T, A> rotl(batch<T, A> const& self, STy other, requires_arch<generic>) noexcept
+        {
+            constexpr auto N = std::numeric_limits<T>::digits + std::numeric_limits<T>::is_signed; // full lane width: digits alone omits the sign bit of signed T
+            return (self << other) | (self >> (N - other)); // NOTE(review): assumes 0 < other < N; for signed T, >> presumably sign-extends — check negative lanes
+        }
+
+        // rotr: rotate every lane of self right by other bits, built from two shifts and a bitwise or
+        template <class A, class T, class STy>
+        inline batch<T, A> rotr(batch<T, A> const& self, STy other, requires_arch<generic>) noexcept
+        {
+            constexpr auto N = std::numeric_limits<T>::digits + std::numeric_limits<T>::is_signed; // full lane width: digits alone omits the sign bit of signed T
+            return (self >> other) | (self << (N - other)); // NOTE(review): assumes 0 < other < N; for signed T, >> presumably sign-extends — check negative lanes
+        }
+
         // sadd
         template <class A>
         inline batch<float, A> sadd(batch<float, A> const& self, batch<float, A> const& other, requires_arch<generic>) noexcept
         {
             return add(self, other); // no saturated arithmetic on floating point numbers
         }
+        template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
+        inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+        {
+            if (!std::is_signed<T>::value)
+            {
+                // unsigned: add no more than the headroom left between self and max()
+                const auto headroom = std::numeric_limits<T>::max() - self;
+                return self + min(headroom, other);
+            }
+            // signed: clamp self first so that other + self cannot leave [min(), max()]
+            auto upper_clamped = min(std::numeric_limits<T>::max() - other, self);
+            auto lower_clamped = max(std::numeric_limits<T>::min() - other, self);
+            // the top bit of other, shifted across the lane, serves as the mask picking the clamp for negative other
+            auto sign_fill = (other >> (8 * sizeof(T) - 1));
+            const batch_bool<T, A> other_is_negative(sign_fill.data);
+            return other + select(other_is_negative, lower_clamped, upper_clamped);
+        }
         template <class A>
         inline batch<double, A> sadd(batch<double, A> const& self, batch<double, A> const& other, requires_arch<generic>) noexcept
         {
@@ -167,6 +215,19 @@ namespace xsimd
         {
             return sub(self, other); // no saturated arithmetic on floating point numbers
         }
+        template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
+        inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+        {
+            if (!std::is_signed<T>::value)
+            {
+                return self - min(self, other); // unsigned: never take away more than self holds, so lanes stop at 0
+            }
+            // signed: pre-clamp self so self - other stays in range (sadd(self, -other) mis-saturates for other == min(), where -other wraps)
+            auto other_sign = (other >> (8 * sizeof(T) - 1)); // same sign-mask construction as the integral sadd
+            auto floor_safe = max(std::numeric_limits<T>::min() + other, self); // picked where other >= 0: guards the lower bound
+            auto ceil_safe = min(std::numeric_limits<T>::max() + other, self); // picked where other < 0: guards the upper bound
+            return select(batch_bool<T, A>(other_sign.data), ceil_safe, floor_safe) - other;
+        }
         template <class A>
         inline batch<double, A> ssub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<generic>) noexcept
         {

diff --git a/pythran/xsimd/arch/generic/xsimd_generic_complex.hpp b/pythran/xsimd/arch/generic/xsimd_generic_complex.hpp
@@ -90,6 +90,18 @@ namespace xsimd
         {
             return batch_bool<T, A>(isnan(self.real()) || isnan(self.imag()));
         }
+
+        template <class A, class T>
+        inline batch_bool<T, A> isinf(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
+        { // complex isinf: the resulting mask is expressed over the underlying real type T
+            return batch_bool<T, A>(isinf(self.real()) || isinf(self.imag())); // set where either component tests infinite
+        }
+
+        template <class A, class T>
+        inline batch_bool<T, A> isfinite(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
+        { // complex isfinite: the resulting mask is expressed over the underlying real type T
+            return batch_bool<T, A>(isfinite(self.real()) && isfinite(self.imag())); // set only where both components test finite
+        }
     }
 }
 

diff --git a/pythran/xsimd/arch/generic/xsimd_generic_details.hpp b/pythran/xsimd/arch/generic/xsimd_generic_details.hpp
@@ -59,7 +59,9 @@ namespace xsimd
     template <class T, class A>
     inline batch_bool<T, A> is_odd(batch<T, A> const& self) noexcept;
     template <class T, class A>
-    inline batch_bool<T, A> isinf(batch<T, A> const& self) noexcept;
+    inline typename batch<T, A>::batch_bool_type isinf(batch<T, A> const& self) noexcept; // forward declaration; result type now spelled via batch_bool_type, like isnan below
+    template <class T, class A>
+    inline typename batch<T, A>::batch_bool_type isfinite(batch<T, A> const& self) noexcept; // forward declaration; same shape as isinf above
     template <class T, class A>
     inline typename batch<T, A>::batch_bool_type isnan(batch<T, A> const& self) noexcept;
     template <class T, class A>
@@ -178,6 +180,36 @@ namespace xsimd
             {
                 return bitwise_cast<int64_t>(self);
             }
+
+            // Generic uint32_t -> float fast_cast. The defaulted template parameter only
+            // compiles when arch A offers an int32_t -> float fast_cast, so the overload drops out otherwise
+            template <class A, class _ = decltype(fast_cast(std::declval<batch<int32_t, A> const&>(), std::declval<batch<float, A> const&>(), A {}))>
+            inline batch<float, A> fast_cast(batch<uint32_t, A> const& v, batch<float, A> const&, requires_arch<generic>) noexcept
+            {
+                // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse
+                batch<uint32_t, A> msk_lo(0xFFFF);
+                batch<float, A> cnst65536f(65536.0f);
+
+                auto v_lo = batch_cast<int32_t>(v & msk_lo); /* the 16 least significant bits of v, as a signed lane */
+                auto v_hi = batch_cast<int32_t>(v >> 16); /* the 16 most significant bits of v, as a signed lane */
+                auto v_lo_flt = batch_cast<float>(v_lo); /* No rounding: a 16-bit value fits in float's significand */
+                auto v_hi_flt = batch_cast<float>(v_hi); /* No rounding: a 16-bit value fits in float's significand */
+                v_hi_flt = cnst65536f * v_hi_flt; /* No rounding: scaling by 2^16 moves the high half back to its place */
+                return v_hi_flt + v_lo_flt; /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */
+            }
+
+            // Generic float -> uint32_t fast_cast, built on the signed conversion; the defaulted
+            // template parameter only compiles when arch A offers a float -> int32_t fast_cast
+            template <class A, class _ = decltype(fast_cast(std::declval<batch<float, A> const&>(), std::declval<batch<int32_t, A> const&>(), A {}))>
+            inline batch<uint32_t, A> fast_cast(batch<float, A> const& v, batch<uint32_t, A> const&, requires_arch<generic>) noexcept
+            {
+                const batch<float, A> two_pow_31(1u << 31);
+                const batch<int32_t, A> top_bit(1u << 31);
+                // below 2^31 the signed conversion is taken as is; from 2^31 up, v - 2^31 is converted and bit 31 is set again
+                const auto in_range = bitwise_cast<float>(batch_cast<int32_t>(v));
+                const auto shifted = bitwise_cast<float>(batch_cast<int32_t>(v - two_pow_31) ^ top_bit);
+                return bitwise_cast<uint32_t>(select(v >= two_pow_31, shifted, in_range));
+            }
         }
 
         namespace detail

diff --git a/pythran/xsimd/arch/generic/xsimd_generic_math.hpp b/pythran/xsimd/arch/generic/xsimd_generic_math.hpp
@@ -95,12 +95,12 @@ namespace xsimd
         template <class A>
         inline batch<float, A> bitofsign(batch<float, A> const& self, requires_arch<generic>) noexcept
         {
-            return self & constants::minuszero<batch<float, A>>();
+            return self & constants::signmask<batch<float, A>>(); // bitwise AND with the signmask constant; presumably keeps only each lane's sign bit — confirm against constants::signmask
         }
         template <class A>
         inline batch<double, A> bitofsign(batch<double, A> const& self, requires_arch<generic>) noexcept
         {
-            return self & constants::minuszero<batch<double, A>>();
+            return self & constants::signmask<batch<double, A>>(); // bitwise AND with the signmask constant; presumably keeps only each lane's sign bit — confirm against constants::signmask
         }
 
         // bitwise_cast
@@ -470,16 +470,18 @@ namespace xsimd
             batch_type x = abs(self);
             auto test0 = self < batch_type(0.);
             batch_type r1(0.);
+            auto test1 = 3.f * x < 2.f;
             batch_type z = x / (batch_type(1.) + x);
-            if (any(3.f * x < 2.f))
+            if (any(test1))
             {
                 r1 = detail::erf_kernel<batch_type>::erfc3(z);
+                if (all(test1))
+                    return select(test0, batch_type(2.) - r1, r1);
             }
-            else
-            {
-                z -= batch_type(0.4f);
-                r1 = exp(-x * x) * detail::erf_kernel<batch_type>::erfc2(z);
-            }
+
+            z -= batch_type(0.4f);
+            batch_type r2 = exp(-x * x) * detail::erf_kernel<batch_type>::erfc2(z);
+            r1 = select(test1, r1, r2);
 #ifndef XSIMD_NO_INFINITIES
             r1 = select(x == constants::infinity<batch_type>(), batch_type(0.), r1);
 #endif
@@ -1849,7 +1851,7 @@ namespace xsimd
         {
             using U = as_integer_t<float>;
             return kernel::detail::apply_transform<U>([](float x) noexcept -> U
-                                                      { return std::lroundf(x); },
+                                                      { return std::nearbyintf(x); },
                                                       self);
         }
 
@@ -1859,7 +1861,7 @@ namespace xsimd
         {
             using U = as_integer_t<double>;
             return kernel::detail::apply_transform<U>([](double x) noexcept -> U
-                                                      { return std::llround(x); },
+                                                      { return std::nearbyint(x); },
                                                       self);
         }