Pull upstream #3

Merged 17 commits on Nov 4, 2023

Commits
e92cd86  Initial Implementation for the new WASM based instruction set (anutosh491, Oct 9, 2023)
6352536  Added the following operations through direct intrinsics (anutosh491, Oct 9, 2023)
1402e0e  Added the following operation through emulations (anutosh491, Oct 18, 2023)
f27bbb8  Merge pull request #952 from anutosh491/Introducing_wasm_based_IS (JohanMabille, Oct 25, 2023)
2898acf  Implemented sadd, ssub & reduce_add through the generic implementatio… (anutosh491, Oct 25, 2023)
e07a7eb  Merge pull request #960 from anutosh491/address_todo (JohanMabille, Oct 25, 2023)
029aa9b  Fixed RTD build (JohanMabille, Oct 29, 2023)
f9dcafb  Provide a generic version for float to uint32_t conversion, only if t… (serge-sans-paille, Oct 29, 2023)
011d355  Merge pull request #963 from xtensor-stack/feature/syndicate-fast-cas… (JohanMabille, Oct 31, 2023)
0ba53ef  Provide a generic version for uint32_t to float conversion, only if t… (serge-sans-paille, Oct 31, 2023)
eefd19c  Merge pull request #964 from xtensor-stack/feature/syndicate-fast-cas… (JohanMabille, Nov 1, 2023)
2eaa6ee  Implemented the following operations for the wasm instruction set: (anutosh491, Oct 25, 2023)
fc6c3fb  Fixed jQuery not loaded in RTD page (JohanMabille, Nov 2, 2023)
54278ed  Updated to last Intel SDE (JohanMabille, Nov 2, 2023)
46c561b  Merge pull request #966 from JohanMabille/update_sde (JohanMabille, Nov 2, 2023)
b816668  Merge branch 'xtensor-stack:master' into remaining_ops_impl (anutosh491, Nov 2, 2023)
105658a  Merge pull request #962 from anutosh491/remaining_ops_impl (JohanMabille, Nov 2, 2023)
.github/workflows/linux.yml (1 addition, 1 deletion)

@@ -117,7 +117,7 @@ jobs:
       cd _build
       cd test
       if [[ '${{ matrix.sys.flags }}' == 'avx512' || '${{ matrix.sys.flags }}' == 'avx512cd' ]]; then
-        ../../sde-external-8.56.0-2020-07-05-lin/sde64 -skx -- ./test_xsimd
+        ../../sde-external-8.69.1-2021-07-18-lin/sde64 -skx -- ./test_xsimd
       else
         ./test_xsimd
       fi
docs/environment.yml (1 addition, 1 deletion)

@@ -5,4 +5,4 @@ channels:

 dependencies:
   - breathe
-  #- docutils<0.17
+  - sphinx_rtd_theme
docs/source/conf.py (1 addition, 1 deletion)

@@ -17,7 +17,7 @@
 def setup(app):
     app.add_css_file("main_stylesheet.css")

-extensions = ['breathe']
+extensions = ['breathe', 'sphinx_rtd_theme']
 breathe_projects = { 'xsimd': '../xml' }
 templates_path = ['_templates']
 html_static_path = ['_static']
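
Note: the two documentation changes above work together. Recent sphinx_rtd_theme releases are meant to be loaded through the extensions list rather than only selected as html_theme, which also pulls in the theme's bundled JavaScript; that is presumably how the "Fixed RTD build" and "Fixed jQuery not loaded in RTD page" commits resolve the broken Read the Docs output, since Sphinx 6 no longer ships jQuery itself.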
include/xsimd/arch/generic/xsimd_generic_arithmetic.hpp (44 additions)

@@ -127,6 +127,20 @@ namespace xsimd
         return { res_r, res_i };
     }

+    // hadd
+    template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
+    inline T hadd(batch<T, A> const& self, requires_arch<generic>) noexcept
+    {
+        alignas(A::alignment()) T buffer[batch<T, A>::size];
+        self.store_aligned(buffer);
+        T res = 0;
+        for (T val : buffer)
+        {
+            res += val;
+        }
+        return res;
+    }
+
     // incr
     template <class A, class T>
     inline batch<T, A> incr(batch<T, A> const& self, requires_arch<generic>) noexcept
@@ -172,6 +186,23 @@
     {
         return add(self, other); // no saturated arithmetic on floating point numbers
     }
+    template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
+    inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+    {
+        if (std::is_signed<T>::value)
+        {
+            auto mask = (other >> (8 * sizeof(T) - 1));
+            auto self_pos_branch = min(std::numeric_limits<T>::max() - other, self);
+            auto self_neg_branch = max(std::numeric_limits<T>::min() - other, self);
+            return other + select(batch_bool<T, A>(mask.data), self_neg_branch, self_pos_branch);
+        }
+        else
+        {
+            const auto diffmax = std::numeric_limits<T>::max() - self;
+            const auto mindiff = min(diffmax, other);
+            return self + mindiff;
+        }
+    }
     template <class A>
     inline batch<double, A> sadd(batch<double, A> const& self, batch<double, A> const& other, requires_arch<generic>) noexcept
     {
@@ -184,6 +215,19 @@
     {
         return sub(self, other); // no saturated arithmetic on floating point numbers
     }
+    template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
+    inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+    {
+        if (std::is_signed<T>::value)
+        {
+            return sadd(self, -other);
+        }
+        else
+        {
+            const auto diff = min(self, other);
+            return self - diff;
+        }
+    }
     template <class A>
     inline batch<double, A> ssub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<generic>) noexcept
     {
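
The saturating emulation above is easier to follow in scalar form. The sketch below (hypothetical sadd_scalar, not part of the patch) implements the same logic: for signed T the sign of `other` picks between clamping toward the maximum and toward the minimum, and for unsigned T the addend is capped at the remaining headroom. Where the SIMD code evaluates both clamps and selects by mask, scalar code can simply branch.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <limits>
#include <type_traits>

// Scalar sketch of the generic sadd above (illustration only).
template <class T>
T sadd_scalar(T self, T other)
{
    if (std::is_signed<T>::value)
    {
        // The SIMD version computes both clamped operands and selects with
        // the sign mask of `other`; a scalar version can branch instead.
        if (other >= 0)
            return other + std::min<T>(std::numeric_limits<T>::max() - other, self);
        return other + std::max<T>(std::numeric_limits<T>::min() - other, self);
    }
    // Unsigned: cap the addend at the headroom left above `self`.
    return self + std::min<T>(std::numeric_limits<T>::max() - self, other);
}

int main()
{
    assert(sadd_scalar<int8_t>(120, 100) == 127);    // clamps at INT8_MAX
    assert(sadd_scalar<int8_t>(-120, -100) == -128); // clamps at INT8_MIN
    assert(sadd_scalar<uint8_t>(200, 100) == 255);   // clamps at UINT8_MAX
    return 0;
}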
include/xsimd/arch/generic/xsimd_generic_details.hpp (30 additions)

@@ -180,6 +180,36 @@ namespace xsimd
     {
         return bitwise_cast<int64_t>(self);
     }
+
+    // Provide a generic uint32_t -> float cast only if we have a
+    // non-generic int32_t -> float fast_cast
+    template <class A, class _ = decltype(fast_cast(std::declval<batch<int32_t, A> const&>(), std::declval<batch<float, A> const&>(), A {}))>
+    inline batch<float, A> fast_cast(batch<uint32_t, A> const& v, batch<float, A> const&, requires_arch<generic>) noexcept
+    {
+        // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse
+        batch<uint32_t, A> msk_lo(0xFFFF);
+        batch<float, A> cnst65536f(65536.0f);
+
+        auto v_lo = batch_cast<int32_t>(v & msk_lo); /* extract the 16 lowest significant bits of self */
+        auto v_hi = batch_cast<int32_t>(v >> 16);    /* 16 most significant bits of v */
+        auto v_lo_flt = batch_cast<float>(v_lo);     /* No rounding */
+        auto v_hi_flt = batch_cast<float>(v_hi);     /* No rounding */
+        v_hi_flt = cnst65536f * v_hi_flt;            /* No rounding */
+        return v_hi_flt + v_lo_flt;                  /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */
+    }
+
+    // Provide a generic float -> uint32_t cast only if we have a
+    // non-generic float -> int32_t fast_cast
+    template <class A, class _ = decltype(fast_cast(std::declval<batch<float, A> const&>(), std::declval<batch<int32_t, A> const&>(), A {}))>
+    inline batch<uint32_t, A> fast_cast(batch<float, A> const& v, batch<uint32_t, A> const&, requires_arch<generic>) noexcept
+    {
+        auto is_large = v >= batch<float, A>(1u << 31);
+        auto small = bitwise_cast<float>(batch_cast<int32_t>(v));
+        auto large = bitwise_cast<float>(
+            batch_cast<int32_t>(v - batch<float, A>(1u << 31))
+            ^ batch<int32_t, A>(1u << 31));
+        return bitwise_cast<uint32_t>(select(is_large, large, small));
+    }
 }

 namespace detail
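
Both new casts have straightforward scalar analogues, sketched below for illustration (hypothetical helper names, not part of the patch). The uint32_t -> float path splits the value into two 16-bit halves, each exactly representable in a float's 24-bit mantissa, so only the final add rounds; the float -> uint32_t path rebases values at or above 2^31, which a plain truncating conversion through int32_t cannot represent, and restores the high bit with an XOR.

#include <cstdint>
#include <cstdio>

// Scalar analogue of the generic uint32_t -> float fast_cast above.
float u32_to_float(uint32_t v)
{
    int32_t lo = static_cast<int32_t>(v & 0xFFFFu); // low 16 bits, exact in float
    int32_t hi = static_cast<int32_t>(v >> 16);     // high 16 bits, exact in float
    float lo_flt = static_cast<float>(lo);          // no rounding
    float hi_flt = static_cast<float>(hi);          // no rounding
    return 65536.0f * hi_flt + lo_flt;              // the only rounding step
}

// Scalar analogue of the generic float -> uint32_t fast_cast above.
uint32_t float_to_u32(float v)
{
    if (v >= 2147483648.0f) // >= 2^31: out of int32_t range, rebase first
        return static_cast<uint32_t>(static_cast<int32_t>(v - 2147483648.0f)) ^ 0x80000000u;
    return static_cast<uint32_t>(static_cast<int32_t>(v));
}

int main()
{
    printf("%.1f\n", u32_to_float(0xFFFFFFFFu)); // 4294967296.0 after rounding
    printf("%u\n", float_to_u32(3000000000.0f)); // 3000000000
}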
include/xsimd/arch/xsimd_avx.hpp (27 deletions)

@@ -515,38 +515,11 @@ namespace xsimd
         return _mm256_cvtepi32_ps(self);
     }

-    template <class A>
-    inline batch<float, A> fast_cast(batch<uint32_t, A> const& v, batch<float, A> const&, requires_arch<avx>) noexcept
-    {
-        // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse
-        // adapted to avx
-        __m256i msk_lo = _mm256_set1_epi32(0xFFFF);
-        __m256 cnst65536f = _mm256_set1_ps(65536.0f);
-
-        __m256i v_lo = bitwise_and(batch<uint32_t, A>(v), batch<uint32_t, A>(msk_lo)); /* extract the 16 lowest significant bits of self */
-        __m256i v_hi = bitwise_rshift(batch<uint32_t, A>(v), 16, avx {});              /* 16 most significant bits of v */
-        __m256 v_lo_flt = _mm256_cvtepi32_ps(v_lo);                                    /* No rounding */
-        __m256 v_hi_flt = _mm256_cvtepi32_ps(v_hi);                                    /* No rounding */
-        v_hi_flt = _mm256_mul_ps(cnst65536f, v_hi_flt);                                /* No rounding */
-        return _mm256_add_ps(v_hi_flt, v_lo_flt);                                      /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */
-    }
-
     template <class A>
     inline batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<avx>) noexcept
     {
         return _mm256_cvttps_epi32(self);
     }
-
-    template <class A>
-    inline batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<avx>) noexcept
-    {
-        return _mm256_castps_si256(
-            _mm256_blendv_ps(_mm256_castsi256_ps(_mm256_cvttps_epi32(self)),
-                             _mm256_xor_ps(
-                                 _mm256_castsi256_ps(_mm256_cvttps_epi32(_mm256_sub_ps(self, _mm256_set1_ps(1u << 31)))),
-                                 _mm256_castsi256_ps(_mm256_set1_epi32(1u << 31))),
-                             _mm256_cmp_ps(self, _mm256_set1_ps(1u << 31), _CMP_GE_OQ)));
-    }
 }

 // decr_if
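
The AVX copy removed here is now covered by the gated generic version in xsimd_generic_details.hpp. The gating idiom is worth spelling out; below is a self-contained sketch (hypothetical names, not xsimd's API). The unsigned overload's defaulted template parameter mentions the signed conversion, so the overload only participates in overload resolution for architecture tags where that conversion exists.

#include <cstdio>
#include <utility>

struct with_fast_cast {};    // stands in for an arch with a fast signed cast
struct without_fast_cast {}; // stands in for one without

float fast_to_float(int v, with_fast_cast) { return static_cast<float>(v); }

// Enabled iff fast_to_float(int, A) is well-formed for the tag A; otherwise
// substitution of the default template argument fails and this overload is
// silently discarded, exactly like the decltype gate in the real header.
template <class A, class = decltype(fast_to_float(std::declval<int>(), A {}))>
float fast_to_float(unsigned v, A arch)
{
    // build the unsigned conversion on top of the signed one, as above
    float hi = fast_to_float(static_cast<int>(v >> 16), arch);
    float lo = fast_to_float(static_cast<int>(v & 0xFFFFu), arch);
    return 65536.0f * hi + lo;
}

int main()
{
    printf("%.1f\n", fast_to_float(4294967295u, with_fast_cast {}));
    // fast_to_float(4294967295u, without_fast_cast {}) would fail to
    // compile: no overload is viable for that tag.
}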
include/xsimd/arch/xsimd_avx2.hpp (15 deletions)

@@ -279,21 +279,6 @@ namespace xsimd
     namespace detail
     {

-        template <class A>
-        inline batch<float, A> fast_cast(batch<uint32_t, A> const& v, batch<float, A> const&, requires_arch<avx2>) noexcept
-        {
-            // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse
-            __m256i msk_lo = _mm256_set1_epi32(0xFFFF);
-            __m256 cnst65536f = _mm256_set1_ps(65536.0f);
-
-            __m256i v_lo = _mm256_and_si256(v, msk_lo);     /* extract the 16 lowest significant bits of self */
-            __m256i v_hi = _mm256_srli_epi32(v, 16);        /* 16 most significant bits of v */
-            __m256 v_lo_flt = _mm256_cvtepi32_ps(v_lo);     /* No rounding */
-            __m256 v_hi_flt = _mm256_cvtepi32_ps(v_hi);     /* No rounding */
-            v_hi_flt = _mm256_mul_ps(cnst65536f, v_hi_flt); /* No rounding */
-            return _mm256_add_ps(v_hi_flt, v_lo_flt);       /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */
-        }
-
         template <class A>
         inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<avx2>) noexcept
         {
include/xsimd/arch/xsimd_generic_fwd.hpp (6 additions)

@@ -31,6 +31,12 @@ namespace xsimd
     inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
     template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
     inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
+    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+    inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
+    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+    inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
+    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+    inline T hadd(batch<T, A> const& self, requires_arch<generic>) noexcept;

 }
 }
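
These declarations matter because of header order: arch-specific files such as xsimd_sse2.hpp now call sadd, ssub and hadd with the generic tag, yet xsimd_generic.hpp is included last (see xsimd_isa.hpp below). A minimal sketch of the pattern, with hypothetical names:

#include <cstdio>

struct generic {};

// forward declaration, playing the role of xsimd_generic_fwd.hpp
int twice(int v, generic);

// "arch-specific" code compiled before the generic definition is seen, as
// xsimd_sse2.hpp is; without the declaration above this would not compile
int twice_sse2(int v) { return twice(v, generic {}); }

// generic implementation, included last like xsimd_generic.hpp
int twice(int v, generic) { return 2 * v; }

int main() { printf("%d\n", twice_sse2(21)); } // prints 42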
include/xsimd/arch/xsimd_isa.hpp (4 additions)

@@ -80,6 +80,10 @@
 #include "./xsimd_sve.hpp"
 #endif

+#if XSIMD_WITH_WASM
+#include "./xsimd_wasm.hpp"
+#endif
+
 // Must come last to have access to all conversion specializations.
 #include "./xsimd_generic.hpp"
include/xsimd/arch/xsimd_sse2.hpp (6 additions, 87 deletions)

@@ -541,21 +541,6 @@ namespace xsimd
         return _mm_cvtepi32_ps(self);
     }

-    template <class A>
-    inline batch<float, A> fast_cast(batch<uint32_t, A> const& v, batch<float, A> const&, requires_arch<sse2>) noexcept
-    {
-        // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse
-        __m128i msk_lo = _mm_set1_epi32(0xFFFF);
-        __m128 cnst65536f = _mm_set1_ps(65536.0f);
-
-        __m128i v_lo = _mm_and_si128(v, msk_lo);     /* extract the 16 lowest significant bits of self */
-        __m128i v_hi = _mm_srli_epi32(v, 16);        /* 16 most significant bits of v */
-        __m128 v_lo_flt = _mm_cvtepi32_ps(v_lo);     /* No rounding */
-        __m128 v_hi_flt = _mm_cvtepi32_ps(v_hi);     /* No rounding */
-        v_hi_flt = _mm_mul_ps(cnst65536f, v_hi_flt); /* No rounding */
-        return _mm_add_ps(v_hi_flt, v_lo_flt);       /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */
-    }
-
     template <class A>
     inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<sse2>) noexcept
     {
@@ -588,18 +573,6 @@
     {
         return _mm_cvttps_epi32(self);
     }
-
-    template <class A>
-    inline batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<sse2>) noexcept
-    {
-        __m128 mask = _mm_cmpge_ps(self, _mm_set1_ps(1u << 31));
-        __m128 lhs = _mm_castsi128_ps(_mm_cvttps_epi32(self));
-        __m128 rhs = _mm_castsi128_ps(_mm_xor_si128(
-            _mm_cvttps_epi32(_mm_sub_ps(self, _mm_set1_ps(1u << 31))),
-            _mm_set1_epi32(1u << 31)));
-        return _mm_castps_si128(_mm_or_ps(_mm_and_ps(mask, rhs), _mm_andnot_ps(mask, lhs)));
-    }
 }

 // eq
@@ -1237,22 +1210,7 @@
     batch<T, A> acc3 = min(acc2, step3);
     return acc3.get(0);
 }
-// TODO: move this in xsimd_generic
-namespace detail
-{
-    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
-    inline T hadd_default(batch<T, A> const& self, requires_arch<sse2>) noexcept
-    {
-        alignas(A::alignment()) T buffer[batch<T, A>::size];
-        self.store_aligned(buffer);
-        T res = 0;
-        for (T val : buffer)
-        {
-            res += val;
-        }
-        return res;
-    }
-}

 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
 inline T reduce_add(batch<T, A> const& self, requires_arch<sse2>) noexcept
 {
@@ -1280,7 +1238,7 @@
     }
     else
     {
-        return detail::hadd_default(self, A {});
+        return hadd(self, generic {});
     }
 }
 template <class A>
@@ -1381,28 +1339,6 @@

 // sadd

-// TODO: move this in xsimd_generic
-namespace detail
-{
-    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
-    inline batch<T, A> sadd_default(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
-    {
-        if (std::is_signed<T>::value)
-        {
-            auto mask = (other >> (8 * sizeof(T) - 1));
-            auto self_pos_branch = min(std::numeric_limits<T>::max() - other, self);
-            auto self_neg_branch = max(std::numeric_limits<T>::min() - other, self);
-            return other + select(batch_bool<T, A>(mask.data), self_neg_branch, self_pos_branch);
-        }
-        else
-        {
-            const auto diffmax = std::numeric_limits<T>::max() - self;
-            const auto mindiff = min(diffmax, other);
-            return self + mindiff;
-        }
-    }
-}
-
 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
 inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
 {
@@ -1418,7 +1354,7 @@
     }
     else
     {
-        return detail::sadd_default(self, other, A {});
+        return sadd(self, other, generic {});
     }
 }
 else
@@ -1433,7 +1369,7 @@
     }
     else
     {
-        return detail::sadd_default(self, other, A {});
+        return sadd(self, other, generic {});
     }
 }
 }
@@ -1495,23 +1431,6 @@
 }

 // ssub
-// TODO: move this in xsimd_generic
-namespace detail
-{
-    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
-    inline batch<T, A> ssub_default(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
-    {
-        if (std::is_signed<T>::value)
-        {
-            return sadd(self, -other);
-        }
-        else
-        {
-            const auto diff = min(self, other);
-            return self - diff;
-        }
-    }
-}

 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
 inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
@@ -1528,7 +1447,7 @@
     }
     else
     {
-        return detail::ssub_default(self, other, A {});
+        return ssub(self, other, generic {});
     }
 }
 else
@@ -1543,7 +1462,7 @@
     }
     else
     {
-        return detail::ssub_default(self, other, A {});
+        return ssub(self, other, generic {});
     }
 }
 }
include/xsimd/arch/xsimd_sse4_1.hpp (11 deletions)

@@ -65,17 +65,6 @@ namespace xsimd
         __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
         return _mm_add_pd(f, _mm_castsi128_pd(xL));
     }
-
-    template <class A>
-    inline batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<sse4_1>) noexcept
-    {
-        return _mm_castps_si128(
-            _mm_blendv_ps(_mm_castsi128_ps(_mm_cvttps_epi32(self)),
-                          _mm_castsi128_ps(_mm_xor_si128(
-                              _mm_cvttps_epi32(_mm_sub_ps(self, _mm_set1_ps(1u << 31))),
-                              _mm_set1_epi32(1u << 31))),
-                          _mm_cmpge_ps(self, _mm_set1_ps(1u << 31))));
-    }
 }

 // eq