diff --git a/include/xsimd/arch/xsimd_wasm.hpp b/include/xsimd/arch/xsimd_wasm.hpp
index f161120c6..32a5d67c8 100644
--- a/include/xsimd/arch/xsimd_wasm.hpp
+++ b/include/xsimd/arch/xsimd_wasm.hpp
@@ -19,11 +19,25 @@ namespace xsimd
 {
+    template <class batch_type, bool... Values>
+    struct batch_bool_constant;
+
+    template <class T_out, class T_in, class A>
+    inline batch<T_out, A> bitwise_cast(batch<T_in, A> const& x) noexcept;
+
+    template <class batch_type, typename batch_type::value_type... Values>
+    struct batch_constant;

     namespace kernel
     {
         using namespace types;

+        // fwd
+        template <class A, class T, size_t I>
+        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept;
+        template <class A, class T, class ITy, ITy... Indices>
+        inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<batch<ITy, A>, Indices...>, requires_arch<generic>) noexcept;
+
         // abs
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, void>::type>
         inline batch<T, A> abs(batch<T, A> const& self, requires_arch<wasm>) noexcept
@@ -136,6 +150,13 @@ namespace xsimd
             return wasm_i8x16_bitmask(self) != 0;
         }

+        // batch_bool_cast
+        template <class A, class T_out, class T_in>
+        inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<wasm>) noexcept
+        {
+            return { bitwise_cast<T_out>(batch<T_in, A>(self.data)).data };
+        }
+
         // bitwise_and
         template <class A, class T>
         inline batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
@@ -162,6 +183,13 @@ namespace xsimd
             return wasm_v128_andnot(self, other);
         }

+        // bitwise_cast
+        template <class A, class T_in, class T_out>
+        inline batch<T_out, A> bitwise_cast(batch<T_in, A> const& self, batch<T_out, A> const&, requires_arch<wasm>) noexcept
+        {
+            return batch<T_out, A>(self.data);
+        }
+
         // bitwise_or
         template <class A, class T>
         inline batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
@@ -415,6 +443,53 @@ namespace xsimd
             return wasm_f64x2_eq(self, other);
         }

+        // fast_cast
+        namespace detail
+        {
+            template <class A>
+            inline batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<wasm>) noexcept
+            {
+                return wasm_f32x4_convert_i32x4(self);
+            }
+
+            template <class A>
+            inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<wasm>) noexcept
+            {
+                // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
+                // adapted to wasm
+                v128_t xH = wasm_u64x2_shr(x, 32);
+                xH = wasm_v128_or(xH, wasm_f64x2_splat(19342813113834066795298816.)); // 2^84
+                v128_t mask = wasm_i16x8_make(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000);
+                v128_t xL = wasm_v128_or(wasm_v128_and(mask, x), wasm_v128_andnot(wasm_f64x2_splat(0x0010000000000000), mask)); // 2^52
+                v128_t f = wasm_f64x2_sub(xH, wasm_f64x2_splat(19342813118337666422669312.)); // 2^84 + 2^52
+                return wasm_f64x2_add(f, xL);
+            }
+
+            template <class A>
+            inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<wasm>) noexcept
+            {
+                // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
+                // adapted to wasm
+                v128_t xH = wasm_i32x4_shr(x, 16);
+                xH = wasm_v128_and(xH, wasm_i16x8_make(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF));
+                xH = wasm_i64x2_add(xH, wasm_f64x2_splat(442721857769029238784.)); // 3*2^67
+                v128_t mask = wasm_i16x8_make(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000);
+                v128_t xL = wasm_v128_or(wasm_v128_and(mask, x), wasm_v128_andnot(wasm_f64x2_splat(0x0010000000000000), mask)); // 2^52
+                v128_t f = wasm_f64x2_sub(xH, wasm_f64x2_splat(442726361368656609280.)); // 3*2^67 + 2^52
+                return wasm_f64x2_add(f, xL);
+            }
+
+            template <class A>
+            inline batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<wasm>) noexcept
+            {
+                return wasm_i32x4_make(
+                    static_cast<int32_t>(wasm_f32x4_extract_lane(self, 0)),
+                    static_cast<int32_t>(wasm_f32x4_extract_lane(self, 1)),
+                    static_cast<int32_t>(wasm_f32x4_extract_lane(self, 2)),
+                    static_cast<int32_t>(wasm_f32x4_extract_lane(self, 3)));
+            }
+        }
+
         // floor
         template <class A>
         inline batch<float, A> floor(batch<float, A> const& self, requires_arch<wasm>) noexcept
@@ -516,11 +591,11 @@ namespace xsimd
             }
             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
             {
-                return from_mask(batch_bool<float, A> {}, mask, wasm {});
+                return batch_bool_cast<T>(from_mask(batch_bool<float, A> {}, mask, wasm {}));
             }
             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
             {
-                return from_mask(batch_bool<double, A> {}, mask, wasm {});
+                return batch_bool_cast<T>(from_mask(batch_bool<double, A> {}, mask, wasm {}));
             }
         }

@@ -1039,6 +1114,44 @@ namespace xsimd
             return wasm_f64x2_extract_lane(tmp2, 0);
         }

+        // reduce_max
+        template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
+        inline T reduce_max(batch<T, A> const& self, requires_arch<wasm>) noexcept
+        {
+            batch<T, A> step0 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2, 3, 0, 0);
+            batch<T, A> acc0 = max(self, step0);
+
+            batch<T, A> step1 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 1, 0, 0, 0);
+            batch<T, A> acc1 = max(acc0, step1);
+
+            batch<T, A> step2 = wasm_i16x8_shuffle(acc1, wasm_i16x8_splat(0), 1, 0, 0, 0, 4, 5, 6, 7);
+            batch<T, A> acc2 = max(acc1, step2);
+            if (sizeof(T) == 2)
+                return acc2.get(0);
+            batch<T, A> step3 = bitwise_cast<T>(bitwise_cast<uint16_t>(acc2) >> 8);
+            batch<T, A> acc3 = max(acc2, step3);
+            return acc3.get(0);
+        }
+
+        // reduce_min
+        template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
+        inline T reduce_min(batch<T, A> const& self, requires_arch<wasm>) noexcept
+        {
+            batch<T, A> step0 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2, 3, 0, 0);
+            batch<T, A> acc0 = min(self, step0);
+
+            batch<T, A> step1 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 1, 0, 0, 0);
+            batch<T, A> acc1 = min(acc0, step1);
+
+            batch<T, A> step2 = wasm_i16x8_shuffle(acc1, wasm_i16x8_splat(0), 1, 0, 0, 0, 4, 5, 6, 7);
+            batch<T, A> acc2 = min(acc1, step2);
+            if (sizeof(T) == 2)
+                return acc2.get(0);
+            batch<T, A> step3 = bitwise_cast<T>(bitwise_cast<uint16_t>(acc2) >> 8);
+            batch<T, A> acc3 = min(acc2, step3);
+            return acc3.get(0);
+        }
+
         // rsqrt
         template <class A>
         inline batch<float, A> rsqrt(batch<float, A> const& self, requires_arch<wasm>) noexcept
@@ -1144,6 +1257,33 @@ namespace xsimd
             return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(false_br, cond));
         }

+        // shuffle
+        template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3>
+        inline batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<batch<ITy, A>, I0, I1, I2, I3> mask, requires_arch<wasm>) noexcept
+        {
+            // shuffle within lane
+            if (I0 < 4 && I1 < 4 && I2 >= 4 && I3 >= 4)
+                return wasm_i32x4_shuffle(x, y, I0, I1, I2, I3);
+
+            // shuffle within opposite lane
+            if (I0 >= 4 && I1 >= 4 && I2 < 4 && I3 < 4)
+                return wasm_i32x4_shuffle(y, x, I0, I1, I2, I3);
+            return shuffle(x, y, mask, generic {});
+        }
+
+        template <class A, class ITy, ITy I0, ITy I1>
+        inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<batch<ITy, A>, I0, I1> mask, requires_arch<wasm>) noexcept
+        {
+            // shuffle within lane
+            if (I0 < 2 && I1 >= 2)
+                return wasm_i64x2_shuffle(x, y, I0, I1);
+
+            // shuffle within opposite lane
+            if (I0 >= 2 && I1 < 2)
+                return wasm_i64x2_shuffle(y, x, I0, I1);
+            return shuffle(x, y, mask, generic {});
+        }
+
         // set
         template <class A, class T, class... Values>
         inline batch<T, A> set(batch<T, A> const&, requires_arch<wasm>, Values... values) noexcept
@@ -1243,25 +1383,21 @@ namespace xsimd
         template <class A>
         inline void store_aligned(float* mem, batch<float, A> const& self, requires_arch<wasm>) noexcept
         {
-            // Assuming that mem is aligned properly, you can use wasm_v128_store to store the batch.
             return wasm_v128_store(mem, self);
         }
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         inline void store_aligned(T* mem, batch<T, A> const& self, requires_arch<wasm>) noexcept
         {
-            // Assuming that mem is aligned properly, you can use wasm_v128_store to store the batch.
             return wasm_v128_store((v128_t*)mem, self);
         }
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         inline void store_aligned(T* mem, batch_bool<T, A> const& self, requires_arch<wasm>) noexcept
         {
-            // Assuming that mem is aligned properly, you can use wasm_v128_store to store the batch.
             return wasm_v128_store((v128_t*)mem, self);
         }
         template <class A>
         inline void store_aligned(double* mem, batch<double, A> const& self, requires_arch<wasm>) noexcept
         {
-            // Assuming that mem is aligned properly, you can use wasm_v128_store to store the batch.
             return wasm_v128_store(mem, self);
         }

@@ -1363,6 +1499,44 @@ namespace xsimd
             return wasm_f64x2_sqrt(val);
         }

+        // swizzle
+
+        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+        inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3>, requires_arch<wasm>) noexcept
+        {
+            return wasm_i32x4_shuffle(self, self, V0, V1, V2, V3);
+        }
+
+        template <class A, uint64_t V0, uint64_t V1>
+        inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1>, requires_arch<wasm>) noexcept
+        {
+            return wasm_i64x2_shuffle(self, self, V0, V1);
+        }
+
+        template <class A, uint64_t V0, uint64_t V1>
+        inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1>, requires_arch<wasm>) noexcept
+        {
+            return wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1);
+        }
+
+        template <class A, uint64_t V0, uint64_t V1>
+        inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1> mask, requires_arch<wasm>) noexcept
+        {
+            return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, wasm {}));
+        }
+
+        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+        inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3>, requires_arch<wasm>) noexcept
+        {
+            return wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), V0, V1, V2, V3);
+        }
+
+        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+        inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> mask, requires_arch<wasm>) noexcept
+        {
+            return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, wasm {}));
+        }
+
         // trunc
         template <class A>
         inline batch<float, A> trunc(batch<float, A> const& self, requires_arch<wasm>) noexcept
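
Reviewer notes — the snippets below are illustrative sketches, not part of the patch.

The uint64_t -> double fast_cast uses the "magic constant" trick from the linked Stack Overflow answer: the low 32 bits of each element are OR-ed into the mantissa of 2^52 and the high 32 bits into the mantissa of 2^84, then the two biases are subtracted back out. A scalar model of the unsigned variant, assuming IEEE-754 doubles (the function name is ours):

    #include <cstdint>
    #include <cstring>

    double u64_to_double_model(uint64_t x)
    {
        // 0x433... and 0x453... are the bit patterns of 2^52 and 2^84; OR-ing a
        // 32-bit value into their low mantissa bits yields exactly 2^52 + lo and
        // 2^84 + hi * 2^32 (the mantissa unit of 2^84 is 2^32).
        uint64_t lo_bits = 0x4330000000000000ull | (x & 0xFFFFFFFFull);
        uint64_t hi_bits = 0x4530000000000000ull | (x >> 32);
        double lo, hi;
        std::memcpy(&lo, &lo_bits, sizeof lo);
        std::memcpy(&hi, &hi_bits, sizeof hi);
        // (2^84 + hi*2^32) - (2^84 + 2^52) == hi*2^32 - 2^52, computed exactly;
        // the final addition cancels the remaining 2^52 bias and is the single
        // rounding step, so the result matches a plain uint64 -> double cast.
        return (hi - 19342813118337666422669312.) + lo; // 2^84 + 2^52
    }

The signed overload follows the same pattern with a 3*2^67 bias so that the sign-extended high part stays in range; see the linked answer for the derivation.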
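
reduce_max and reduce_min for the 8- and 16-bit types are halving reductions: each step shuffles the upper half of the remaining elements down and folds it in with an element-wise max/min, so 16 8-bit lanes need four folds (the last one done with a 16-bit shift by 8) and 8 16-bit lanes need three. A scalar sketch of that strategy, purely for illustration; reduce_min is the same with std::min:

    #include <algorithm>
    #include <cstddef>

    template <class T, std::size_t N>
    T reduce_max_model(const T (&v)[N])
    {
        T tmp[N];
        std::copy(v, v + N, tmp);
        // For N == 16 the widths are 8, 4, 2, 1 -- four folds; compare
        // step0 through step3 in the kernel above.
        for (std::size_t width = N / 2; width >= 1; width /= 2)
            for (std::size_t i = 0; i < width; ++i)
                tmp[i] = std::max(tmp[i], tmp[i + width]); // fold upper half onto lower half
        return tmp[0];
    }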
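
The 64-bit integer swizzle is expressed as a 32-bit shuffle: each 64-bit lane index V expands to the 32-bit index pair (2*V, 2*V + 1). A minimal standalone illustration with the raw intrinsics, assuming an Emscripten build with -msimd128 (the helper name is ours, not part of xsimd):

    #include <wasm_simd128.h>

    // Swap the two 64-bit lanes of x: V0 = 1, V1 = 0 becomes the 32-bit
    // index list (2*1, 2*1 + 1, 2*0, 2*0 + 1) = (2, 3, 0, 1).
    static inline v128_t swap_u64_lanes(v128_t x)
    {
        return wasm_i32x4_shuffle(x, wasm_i32x4_splat(0), 2, 3, 0, 1);
    }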
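
Finally, an end-to-end usage sketch that drives the new swizzle, shuffle and reduce kernels through xsimd's public entry points. It assumes an Emscripten build with -msimd128 (so xsimd::wasm is the active architecture) and this revision's batch_constant spelling; the function and sample values are ours:

    #include <cstdint>
    #include "xsimd/xsimd.hpp"

    using batch_f = xsimd::batch<float, xsimd::wasm>;
    using idx4 = xsimd::batch<uint32_t, xsimd::wasm>;

    float demo()
    {
        alignas(16) float in[4] = { 1.f, 2.f, 3.f, 4.f };
        batch_f x = batch_f::load_aligned(in); // {1, 2, 3, 4}
        batch_f y = x + 10.f;                  // {11, 12, 13, 14}

        // swizzle: reverse the lanes of x -> {4, 3, 2, 1}
        auto rev = xsimd::swizzle(x, xsimd::batch_constant<idx4, 3, 2, 1, 0> {});

        // shuffle: indices >= 4 pick from y -> {1, 2, 11, 12}
        auto mix = xsimd::shuffle(x, y, xsimd::batch_constant<idx4, 0, 1, 4, 5> {});

        // reduce_max over rev + mix == {5, 5, 13, 13} -> 13
        return xsimd::reduce_max(rev + mix);
    }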