Skip to content

Commit

Permalink
Merge pull request #5 from xtensor-stack/master
Browse files Browse the repository at this point in the history
Pull latest changes from upstream
  • Loading branch information
Pencilcaseman authored Jul 23, 2024
2 parents 7882cef + e624857 commit dd930b6
Show file tree
Hide file tree
Showing 6 changed files with 111 additions and 29 deletions.
6 changes: 5 additions & 1 deletion .github/workflows/linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -102,14 +102,17 @@ jobs:
CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=knm"
fi
if [[ '${{ matrix.sys.flags }}' == 'i386' ]]; then
CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DCMAKE_CXX_FLAGS='-m32'"
CXX_FLAGS="$CXX_FLAGS -m32"
fi
if [[ '${{ matrix.sys.flags }}' == 'force_no_instr_set' ]]; then
:
else
CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DXSIMD_ENABLE_WERROR=ON"
fi
# Cheap way of spotting uninitialized read
CXX_FLAGS="$CXX_FLAGS -ftrivial-auto-var-init=pattern"
mkdir _build
cd _build
# CXX_FLAGS must be double-quoted here: inside single quotes the shell does
# not expand variables, so CMake would receive the literal text $CXX_FLAGS and
# the flags accumulated above (-m32, -ftrivial-auto-var-init=pattern) would
# never reach the compiler.
cmake .. -DBUILD_TESTS=ON \
Expand All @@ -119,6 +122,7 @@ jobs:
-DCMAKE_C_COMPILER=$CC \
-DCMAKE_CXX_COMPILER=$CXX \
$CMAKE_EXTRA_ARGS \
-DCMAKE_CXX_FLAGS="$CXX_FLAGS" \
-G Ninja
- name: Build
run: ninja -C _build
Expand Down
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ install(DIRECTORY ${XSIMD_INCLUDE_DIR}/xsimd
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})

# GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share".
set(XSIMD_CMAKECONFIG_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}" CACHE STRING "install path for xsimdConfig.cmake")
set(XSIMD_CMAKECONFIG_INSTALL_DIR "${CMAKE_INSTALL_DATADIR}/cmake/${PROJECT_NAME}" CACHE STRING "install path for xsimdConfig.cmake")

configure_package_config_file(${PROJECT_NAME}Config.cmake.in
"${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake"
Expand All @@ -164,4 +164,4 @@ configure_file(${PROJECT_NAME}.pc.in
"${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.pc"
@ONLY)
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.pc"
DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/")
DESTINATION "${CMAKE_INSTALL_DATADIR}/pkgconfig/")
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ spack load xsimd
You can directly install it from the sources with cmake:

```bash
cmake -D CMAKE_INSTALL_PREFIX=your_install_prefix
cmake -D CMAKE_INSTALL_PREFIX=your_install_prefix .
make install
```

Expand Down
10 changes: 6 additions & 4 deletions include/xsimd/arch/generic/xsimd_generic_memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -459,6 +459,7 @@ namespace xsimd
XSIMD_INLINE batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<ITy, A, Indices...>, requires_arch<generic>) noexcept
{
constexpr size_t bsize = sizeof...(Indices);
static_assert(bsize == batch<T, A>::size, "valid shuffle");

// Detect common patterns
XSIMD_IF_CONSTEXPR(detail::is_swizzle_fst(bsize, Indices...))
Expand Down Expand Up @@ -486,14 +487,15 @@ namespace xsimd
return select(batch_bool_constant<T, A, (Indices < bsize)...>(), x, y);
}

#if defined(__has_builtin)
#if __has_builtin(__builtin_shuffle_vector)
#define builtin_shuffle __builtin_shuffle_vector
#if defined(__has_builtin) && !defined(XSIMD_WITH_EMULATED)
#if __has_builtin(__builtin_shufflevector)
#define builtin_shuffle __builtin_shufflevector
#endif
#endif

#if defined(builtin_shuffle)
return builtin_shuffle(x.data, y.data, Indices...);
typedef T vty __attribute__((__vector_size__(sizeof(batch<T, A>))));
return (typename batch<T, A>::register_type)builtin_shuffle((vty)x.data, (vty)y.data, Indices...);

// FIXME: my experiments show that GCC only correctly optimizes this builtin
// starting at GCC 13, where it already has __builtin_shufflevector
Expand Down
103 changes: 82 additions & 21 deletions include/xsimd/config/xsimd_cpuid.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ namespace xsimd
// Declares an `unsigned` member named `field_name` and a `has()` overload,
// selected by the tag type `::xsimd::arch`, that returns that member as bool.
#define ARCH_FIELD_EX(arch, field_name) \
unsigned field_name; \
XSIMD_INLINE bool has(::xsimd::arch) const { return this->field_name; }

// Adds only the `has()` overload for `::xsimd::arch`; it reads an already
// declared `field_name` member instead of declaring a new one, so several
// architecture tags can share a single flag.
#define ARCH_FIELD_EX_REUSE(arch, field_name) \
XSIMD_INLINE bool has(::xsimd::arch) const { return this->field_name; }

// Shorthand for the common case where the architecture tag and the member
// share the same identifier.
#define ARCH_FIELD(name) ARCH_FIELD_EX(name, name)

ARCH_FIELD(sse2)
Expand Down Expand Up @@ -72,8 +76,12 @@ namespace xsimd
ARCH_FIELD(neon)
ARCH_FIELD(neon64)
ARCH_FIELD_EX(i8mm<::xsimd::neon64>, i8mm_neon64)
ARCH_FIELD(sve)
ARCH_FIELD(rvv)
ARCH_FIELD_EX(detail::sve<512>, sve)
ARCH_FIELD_EX_REUSE(detail::sve<256>, sve)
ARCH_FIELD_EX_REUSE(detail::sve<128>, sve)
ARCH_FIELD_EX(detail::rvv<512>, rvv)
ARCH_FIELD_EX_REUSE(detail::rvv<256>, rvv)
ARCH_FIELD_EX_REUSE(detail::rvv<128>, rvv)
ARCH_FIELD(wasm)

#undef ARCH_FIELD
Expand Down Expand Up @@ -114,6 +122,35 @@ namespace xsimd
#endif

#elif defined(__x86_64__) || defined(__i386__) || defined(_M_AMD64) || defined(_M_IX86)

// Returns the low 32 bits of the XCR0 extended control register, obtained
// through the XGETBV instruction with register index 0.
// NOTE(review): the visible caller only invokes this when CPUID reports
// OSXSAVE; presumably XGETBV is not usable otherwise - confirm before
// calling it unconditionally.
auto get_xcr0_low = []() noexcept
{
uint32_t xcr0;

#if defined(_MSC_VER) && _MSC_VER >= 1400

// MSVC: use the compiler intrinsic and keep only the low 32 bits.
xcr0 = (uint32_t)_xgetbv(0);

#elif defined(__GNUC__)

// GNU-compatible compilers: zero ECX to select register index 0, then
// execute XGETBV. Only EAX is bound as an output ("=a"); ECX and EDX are
// listed as clobbers, using the register names matching the target width.
__asm__(
"xorl %%ecx, %%ecx\n"
"xgetbv\n"
: "=a"(xcr0)
:
#if defined(__i386__)
: "ecx", "edx"
#else
: "rcx", "rdx"
#endif
);

#else /* _MSC_VER < 1400, or a compiler that is neither MSVC nor GNU-compatible */
#error "_MSC_VER < 1400 is not supported"
#endif /* _MSC_VER && _MSC_VER >= 1400 */
return xcr0;
};

auto get_cpuid = [](int reg[4], int level, int count = 0) noexcept
{

Expand Down Expand Up @@ -148,43 +185,67 @@ namespace xsimd

get_cpuid(regs1, 0x1);

sse2 = regs1[3] >> 26 & 1;
sse3 = regs1[2] >> 0 & 1;
ssse3 = regs1[2] >> 9 & 1;
sse4_1 = regs1[2] >> 19 & 1;
sse4_2 = regs1[2] >> 20 & 1;
fma3_sse42 = regs1[2] >> 12 & 1;
// OS can explicitly disable the usage of SSE/AVX extensions
// by leaving the corresponding state bits cleared in the XCR0 register
//
// https://docs.kernel.org/admin-guide/hw-vuln/gather_data_sampling.html

unsigned sse_state_os_enabled = 1;
unsigned avx_state_os_enabled = 1;
unsigned avx512_state_os_enabled = 1;

// OSXSAVE: A value of 1 indicates that the OS has set CR4.OSXSAVE[bit
// 18] to enable XSETBV/XGETBV instructions to access XCR0 and
// to support processor extended state management using
// XSAVE/XRSTOR.
bool osxsave = regs1[2] >> 27 & 1;
if (osxsave)
{

uint32_t xcr0 = get_xcr0_low();

sse_state_os_enabled = xcr0 >> 1 & 1;
avx_state_os_enabled = xcr0 >> 2 & sse_state_os_enabled;
avx512_state_os_enabled = xcr0 >> 6 & avx_state_os_enabled;
}

sse2 = regs1[3] >> 26 & sse_state_os_enabled;
sse3 = regs1[2] >> 0 & sse_state_os_enabled;
ssse3 = regs1[2] >> 9 & sse_state_os_enabled;
sse4_1 = regs1[2] >> 19 & sse_state_os_enabled;
sse4_2 = regs1[2] >> 20 & sse_state_os_enabled;
fma3_sse42 = regs1[2] >> 12 & sse_state_os_enabled;

avx = regs1[2] >> 28 & 1;
avx = regs1[2] >> 28 & avx_state_os_enabled;
fma3_avx = avx && fma3_sse42;

int regs8[4];
get_cpuid(regs8, 0x80000001);
fma4 = regs8[2] >> 16 & 1;
fma4 = regs8[2] >> 16 & avx_state_os_enabled;

// sse4a = regs[2] >> 6 & 1;

// xop = regs[2] >> 11 & 1;

int regs7[4];
get_cpuid(regs7, 0x7);
avx2 = regs7[1] >> 5 & 1;
avx2 = regs7[1] >> 5 & avx_state_os_enabled;

int regs7a[4];
get_cpuid(regs7a, 0x7, 0x1);
avxvnni = regs7a[0] >> 4 & 1;
avxvnni = regs7a[0] >> 4 & avx_state_os_enabled;

fma3_avx2 = avx2 && fma3_sse42;

avx512f = regs7[1] >> 16 & 1;
avx512cd = regs7[1] >> 28 & 1;
avx512dq = regs7[1] >> 17 & 1;
avx512bw = regs7[1] >> 30 & 1;
avx512er = regs7[1] >> 27 & 1;
avx512pf = regs7[1] >> 26 & 1;
avx512ifma = regs7[1] >> 21 & 1;
avx512vbmi = regs7[2] >> 1 & 1;
avx512vnni_bw = regs7[2] >> 11 & 1;
avx512f = regs7[1] >> 16 & avx512_state_os_enabled;
avx512cd = regs7[1] >> 28 & avx512_state_os_enabled;
avx512dq = regs7[1] >> 17 & avx512_state_os_enabled;
avx512bw = regs7[1] >> 30 & avx512_state_os_enabled;
avx512er = regs7[1] >> 27 & avx512_state_os_enabled;
avx512pf = regs7[1] >> 26 & avx512_state_os_enabled;
avx512ifma = regs7[1] >> 21 & avx512_state_os_enabled;
avx512vbmi = regs7[2] >> 1 & avx512_state_os_enabled;
avx512vnni_bw = regs7[2] >> 11 & avx512_state_os_enabled;
avx512vnni_vbmi = avx512vbmi && avx512vnni_bw;
#endif
}
Expand Down
15 changes: 15 additions & 0 deletions test/test_arch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,16 @@ struct check_supported
}
};

struct check_cpu_has_intruction_set
{
template <class Arch>
void operator()(Arch arch) const
{
static_assert(std::is_same<decltype(xsimd::available_architectures().has(arch)), bool>::value,
"cannot test instruction set availability on CPU");
}
};

struct check_available
{
template <class Arch>
Expand Down Expand Up @@ -71,6 +81,11 @@ TEST_CASE("[multi arch support]")
xsimd::supported_architectures::for_each(check_supported {});
}

SUBCASE("xsimd::available_architectures::has")
{
xsimd::all_architectures::for_each(check_cpu_has_intruction_set {});
}

SUBCASE("xsimd::default_arch::name")
{
constexpr char const* name = xsimd::default_arch::name();
Expand Down

0 comments on commit dd930b6

Please sign in to comment.