diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 4282df652..b4dcf41ab 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -102,7 +102,7 @@ jobs: CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=knm" fi if [[ '${{ matrix.sys.flags }}' == 'i386' ]]; then - CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DCMAKE_CXX_FLAGS='-m32'" + CXX_FLAGS="$CXX_FLAGS -m32" fi if [[ '${{ matrix.sys.flags }}' == 'force_no_instr_set' ]]; then : @@ -110,6 +110,9 @@ jobs: CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DXSIMD_ENABLE_WERROR=ON" fi + # Cheap way of spotting uninitialized read + CXX_FLAGS="$CXX_FLAGS -ftrivial-auto-var-init=pattern" + mkdir _build cd _build cmake .. -DBUILD_TESTS=ON \ @@ -119,6 +122,7 @@ jobs: -DCMAKE_C_COMPILER=$CC \ -DCMAKE_CXX_COMPILER=$CXX \ $CMAKE_EXTRA_ARGS \ + -DCMAKE_CXX_FLAGS="$CXX_FLAGS" \ -G Ninja - name: Build run: ninja -C _build diff --git a/CMakeLists.txt b/CMakeLists.txt index 58b1e3498..61d880508 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -139,7 +139,7 @@ install(DIRECTORY ${XSIMD_INCLUDE_DIR}/xsimd DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) # GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share". 
-set(XSIMD_CMAKECONFIG_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}" CACHE STRING "install path for xsimdConfig.cmake") +set(XSIMD_CMAKECONFIG_INSTALL_DIR "${CMAKE_INSTALL_DATADIR}/cmake/${PROJECT_NAME}" CACHE STRING "install path for xsimdConfig.cmake") configure_package_config_file(${PROJECT_NAME}Config.cmake.in "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake" @@ -164,4 +164,4 @@ configure_file(${PROJECT_NAME}.pc.in "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.pc" @ONLY) install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.pc" - DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/") + DESTINATION "${CMAKE_INSTALL_DATADIR}/pkgconfig/") diff --git a/README.md b/README.md index 06759e5c1..87082f488 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,7 @@ spack load xsimd You can directly install it from the sources with cmake: ```bash -cmake -D CMAKE_INSTALL_PREFIX=your_install_prefix +cmake -D CMAKE_INSTALL_PREFIX=your_install_prefix . make install ``` diff --git a/include/xsimd/arch/generic/xsimd_generic_memory.hpp b/include/xsimd/arch/generic/xsimd_generic_memory.hpp index 4651ecdbb..18c9c80ad 100644 --- a/include/xsimd/arch/generic/xsimd_generic_memory.hpp +++ b/include/xsimd/arch/generic/xsimd_generic_memory.hpp @@ -459,6 +459,7 @@ namespace xsimd XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant, requires_arch) noexcept { constexpr size_t bsize = sizeof...(Indices); + static_assert(bsize == batch::size, "valid shuffle"); // Detect common patterns XSIMD_IF_CONSTEXPR(detail::is_swizzle_fst(bsize, Indices...)) @@ -486,14 +487,15 @@ namespace xsimd return select(batch_bool_constant(), x, y); } -#if defined(__has_builtin) -#if __has_builtin(__builtin_shuffle_vector) -#define builtin_shuffle __builtin_shuffle_vector +#if defined(__has_builtin) && !defined(XSIMD_WITH_EMULATED) +#if __has_builtin(__builtin_shufflevector) +#define builtin_shuffle __builtin_shufflevector #endif #endif #if defined(builtin_shuffle) - return 
builtin_shuffle(x.data, y.data, Indices...); + typedef T vty __attribute__((__vector_size__(sizeof(batch)))); + return (typename batch::register_type)builtin_shuffle((vty)x.data, (vty)y.data, Indices...); // FIXME: my experiments show that GCC only correctly optimizes this builtin // starting at GCC 13, where it already has __builtin_shuffle_vector diff --git a/include/xsimd/config/xsimd_cpuid.hpp b/include/xsimd/config/xsimd_cpuid.hpp index f22089bac..8021fceb8 100644 --- a/include/xsimd/config/xsimd_cpuid.hpp +++ b/include/xsimd/config/xsimd_cpuid.hpp @@ -42,6 +42,10 @@ namespace xsimd #define ARCH_FIELD_EX(arch, field_name) \ unsigned field_name; \ XSIMD_INLINE bool has(::xsimd::arch) const { return this->field_name; } + +#define ARCH_FIELD_EX_REUSE(arch, field_name) \ + XSIMD_INLINE bool has(::xsimd::arch) const { return this->field_name; } + #define ARCH_FIELD(name) ARCH_FIELD_EX(name, name) ARCH_FIELD(sse2) @@ -72,8 +76,12 @@ namespace xsimd ARCH_FIELD(neon) ARCH_FIELD(neon64) ARCH_FIELD_EX(i8mm<::xsimd::neon64>, i8mm_neon64) - ARCH_FIELD(sve) - ARCH_FIELD(rvv) + ARCH_FIELD_EX(detail::sve<512>, sve) + ARCH_FIELD_EX_REUSE(detail::sve<256>, sve) + ARCH_FIELD_EX_REUSE(detail::sve<128>, sve) + ARCH_FIELD_EX(detail::rvv<512>, rvv) + ARCH_FIELD_EX_REUSE(detail::rvv<256>, rvv) + ARCH_FIELD_EX_REUSE(detail::rvv<128>, rvv) ARCH_FIELD(wasm) #undef ARCH_FIELD @@ -114,6 +122,35 @@ namespace xsimd #endif #elif defined(__x86_64__) || defined(__i386__) || defined(_M_AMD64) || defined(_M_IX86) + + auto get_xcr0_low = []() noexcept + { + uint32_t xcr0; + +#if defined(_MSC_VER) && _MSC_VER >= 1400 + + xcr0 = (uint32_t)_xgetbv(0); + +#elif defined(__GNUC__) + + __asm__( + "xorl %%ecx, %%ecx\n" + "xgetbv\n" + : "=a"(xcr0) + : +#if defined(__i386__) + : "ecx", "edx" +#else + : "rcx", "rdx" +#endif + ); + +#else /* _MSC_VER < 1400 */ +#error "_MSC_VER < 1400 is not supported" +#endif /* _MSC_VER && _MSC_VER >= 1400 */ + return xcr0; + }; + auto get_cpuid = [](int reg[4], int 
level, int count = 0) noexcept { @@ -148,19 +185,43 @@ namespace xsimd get_cpuid(regs1, 0x1); - sse2 = regs1[3] >> 26 & 1; - sse3 = regs1[2] >> 0 & 1; - ssse3 = regs1[2] >> 9 & 1; - sse4_1 = regs1[2] >> 19 & 1; - sse4_2 = regs1[2] >> 20 & 1; - fma3_sse42 = regs1[2] >> 12 & 1; + // OS can explicitly disable the usage of SSE/AVX extensions + // by clearing the matching state-component bit in the XCR0 register. + // AVX and AVX-512 additionally require OSXSAVE, hence their default of 0. + // https://docs.kernel.org/admin-guide/hw-vuln/gather_data_sampling.html + + unsigned sse_state_os_enabled = 1; + unsigned avx_state_os_enabled = 0; + unsigned avx512_state_os_enabled = 0; + + // OSXSAVE: A value of 1 indicates that the OS has set CR4.OSXSAVE[bit + // 18] to enable XSETBV/XGETBV instructions to access XCR0 and + // to support processor extended state management using + // XSAVE/XRSTOR. + bool osxsave = regs1[2] >> 27 & 1; + if (osxsave) + { + + uint32_t xcr0 = get_xcr0_low(); + // XCR0 bit 1: SSE state, bit 2: AVX state, bits 5-7: AVX-512 state (opmask, ZMM_Hi256, Hi16_ZMM) + sse_state_os_enabled = xcr0 >> 1 & 1; + avx_state_os_enabled = xcr0 >> 2 & sse_state_os_enabled; + avx512_state_os_enabled = (((xcr0 >> 5) & 7u) == 7u) ? avx_state_os_enabled : 0u; + } + + sse2 = regs1[3] >> 26 & sse_state_os_enabled; + sse3 = regs1[2] >> 0 & sse_state_os_enabled; + ssse3 = regs1[2] >> 9 & sse_state_os_enabled; + sse4_1 = regs1[2] >> 19 & sse_state_os_enabled; + sse4_2 = regs1[2] >> 20 & sse_state_os_enabled; + fma3_sse42 = regs1[2] >> 12 & sse_state_os_enabled; - avx = regs1[2] >> 28 & 1; + avx = regs1[2] >> 28 & avx_state_os_enabled; fma3_avx = avx && fma3_sse42; int regs8[4]; get_cpuid(regs8, 0x80000001); - fma4 = regs8[2] >> 16 & 1; + fma4 = regs8[2] >> 16 & avx_state_os_enabled; // sse4a = regs[2] >> 6 & 1; @@ -168,23 +229,23 @@ namespace xsimd int regs7[4]; get_cpuid(regs7, 0x7); - avx2 = regs7[1] >> 5 & 1; + avx2 = regs7[1] >> 5 & avx_state_os_enabled; int regs7a[4]; get_cpuid(regs7a, 0x7, 0x1); - avxvnni = regs7a[0] >> 4 & 1; + avxvnni = regs7a[0] >> 4 & avx_state_os_enabled; fma3_avx2 = avx2 && fma3_sse42; - avx512f = regs7[1] >> 16 & 1; - avx512cd = regs7[1] >> 28 & 1; - 
avx512dq = regs7[1] >> 17 & 1; - avx512bw = regs7[1] >> 30 & 1; - avx512er = regs7[1] >> 27 & 1; - avx512pf = regs7[1] >> 26 & 1; - avx512ifma = regs7[1] >> 21 & 1; - avx512vbmi = regs7[2] >> 1 & 1; - avx512vnni_bw = regs7[2] >> 11 & 1; + avx512f = regs7[1] >> 16 & avx512_state_os_enabled; + avx512cd = regs7[1] >> 28 & avx512_state_os_enabled; + avx512dq = regs7[1] >> 17 & avx512_state_os_enabled; + avx512bw = regs7[1] >> 30 & avx512_state_os_enabled; + avx512er = regs7[1] >> 27 & avx512_state_os_enabled; + avx512pf = regs7[1] >> 26 & avx512_state_os_enabled; + avx512ifma = regs7[1] >> 21 & avx512_state_os_enabled; + avx512vbmi = regs7[2] >> 1 & avx512_state_os_enabled; + avx512vnni_bw = regs7[2] >> 11 & avx512_state_os_enabled; avx512vnni_vbmi = avx512vbmi && avx512vnni_bw; #endif } diff --git a/test/test_arch.cpp b/test/test_arch.cpp index b42073358..f1f50d546 100644 --- a/test/test_arch.cpp +++ b/test/test_arch.cpp @@ -38,6 +38,16 @@ struct check_supported } }; +struct check_cpu_has_instruction_set +{ + template + void operator()(Arch arch) const + { + static_assert(std::is_same::value, + "cannot test instruction set availability on CPU"); + } +}; + struct check_available { template @@ -71,6 +81,11 @@ TEST_CASE("[multi arch support]") xsimd::supported_architectures::for_each(check_supported {}); } + SUBCASE("xsimd::available_architectures::has") + { + xsimd::all_architectures::for_each(check_cpu_has_instruction_set {}); + } + SUBCASE("xsimd::default_arch::name") { constexpr char const* name = xsimd::default_arch::name();