diff --git a/src/core/reference/CMakeLists.txt b/src/core/reference/CMakeLists.txt index 791489b6e5171c..f7874964233cf5 100644 --- a/src/core/reference/CMakeLists.txt +++ b/src/core/reference/CMakeLists.txt @@ -21,6 +21,18 @@ add_library(${TARGET_NAME} STATIC ${LIBRARY_SRC} ${PUBLIC_HEADERS}) add_library(openvino::reference ALIAS ${TARGET_NAME}) set_target_properties(${TARGET_NAME} PROPERTIES EXPORT_NAME reference) +if(ENABLE_AVX2) + ov_avx2_optimization_flags(avx2_flags) + + set(OV_REFERENCE_X86_AVX2_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/src/op/convert_x86_intrinsics.cpp + ) + set_source_files_properties(${OV_REFERENCE_X86_AVX2_SRC} PROPERTIES COMPILE_OPTIONS "${avx2_flags}" + SKIP_UNITY_BUILD_INCLUSION ON + SKIP_PRECOMPILE_HEADERS ON) + target_compile_definitions(${TARGET_NAME} PRIVATE HAVE_AVX2) +endif() + ov_build_target_faster(${TARGET_NAME} UNITY PCH PRIVATE "src/precomp.hpp") diff --git a/src/core/reference/include/openvino/reference/convert.hpp b/src/core/reference/include/openvino/reference/convert.hpp index 926af31dbf5130..efb1ec1ca21415 100644 --- a/src/core/reference/include/openvino/reference/convert.hpp +++ b/src/core/reference/include/openvino/reference/convert.hpp @@ -14,9 +14,11 @@ #include "openvino/core/type/nf4.hpp" #if !defined(OS_CHROMEOS) && (defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)) -# define OV_CORE_USE_XBYAK_JIT 1 -#else -# define OV_CORE_USE_XBYAK_JIT 0 +# define OV_CORE_USE_XBYAK_JIT +#endif + +#if defined(OS_CHROMEOS) && defined(OPENVINO_ARCH_X86_64) && defined(HAVE_AVX2) +# define OV_CORE_USE_INTRINSICS #endif namespace ov { @@ -33,12 +35,12 @@ namespace reference { namespace detail { template -typename std::enable_if::value, TO>::type convert(const TI v) { +constexpr typename std::enable_if::value, TO>::type convert(const TI v) { return static_cast(v); } template -typename std::enable_if::value, TO>::type convert(const TI v) { +constexpr typename std::enable_if::value, TO>::type convert(const TI v) { return static_cast(static_cast(v)); } } // namespace detail @@ -62,8 +64,6 @@ void convert(const TI* arg, TO* out, const size_t count) { std::transform(arg, arg + count, out, detail::convert); } -#if OV_CORE_USE_XBYAK_JIT - template <> void convert(const uint8_t* arg, float16* out, size_t count); template <> @@ -79,8 +79,6 @@ void convert(const bfloat16* arg, float16* out, size_t count) template <> void convert(const bfloat16* arg, float* out, size_t count); -#endif // OV_CORE_USE_XBYAK_JIT - template <> void convert(const int32_t* arg, float16* out, size_t count); diff --git a/src/core/reference/include/openvino/reference/utils/convert_util.hpp b/src/core/reference/include/openvino/reference/utils/convert_util.hpp new file mode 100644 index 00000000000000..3be10c9fd19fbb --- /dev/null +++ b/src/core/reference/include/openvino/reference/utils/convert_util.hpp @@ -0,0 +1,88 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +#include "openvino/reference/convert.hpp" + +namespace ov { + +// forward declare from inference dev API (cannot be included) +extern bool with_cpu_x86_avx2(); + +namespace reference { + +struct NoClamp { + static constexpr bool enabled = false; + + // Generic implementation + template + static constexpr T apply(const T v) { + return v; + } + + // Specialize for optimization + template + static R apply(const T v); +}; + +template +struct Clamp { + static constexpr bool enabled = true; + + // Generic implementation + static constexpr TO apply(const TI v) { + return (v < std::numeric_limits::lowest()) + ? std::numeric_limits::lowest() + : ((v > std::numeric_limits::max()) ? std::numeric_limits::max() + : detail::convert(v)); + } + + // Specialize for optimization + template + static R apply(const T v); +}; + +template +struct Converter { + static constexpr size_t vec_f32_size = 32 / sizeof(float); + + // Generic implementation to convert tail elements + template + static void tail(const TI* in, TO* out, size_t n) { + std::transform(in, in + n, out, [](const TI v) { + return detail::convert(ClampMode::apply(v)); + }); + } + + // Helper struct to defined optimized version of conversion + template + struct Optimized { + static constexpr bool enabled = false; + static void run(const TI* in, TO* out) {} + }; + + // Generic implementation of conversion + template ::enabled>::type* = nullptr> + static void apply(const TI* in, TO* out, size_t n) { + return tail(in, out, n); + } + + // Enabled when Optimized struct specialized defined for optimization + template ::enabled>::type* = nullptr> + static void apply(const TI* in, TO* out, size_t n) { + if (with_cpu_x86_avx2()) { + for (; n >= vec_f32_size; n -= vec_f32_size, in += vec_f32_size, out += vec_f32_size) { + Optimized::run(in, out); + } + } + tail(in, out, n); + } +}; + +} // namespace reference +} // namespace ov diff --git a/src/core/reference/include/openvino/reference/utils/convert_x86_intrinsics.hpp b/src/core/reference/include/openvino/reference/utils/convert_x86_intrinsics.hpp new file mode 100644 index 00000000000000..1716f05ba6ac78 --- /dev/null +++ b/src/core/reference/include/openvino/reference/utils/convert_x86_intrinsics.hpp @@ -0,0 +1,87 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#ifdef OV_CORE_USE_INTRINSICS +# include + +# include "openvino/reference/utils/convert_util.hpp" + +namespace ov { +namespace reference { +# ifdef HAVE_AVX2 + +// Clamp optimized specializations +template <> +__m128i NoClamp::apply<__m256i, __m128i>(const __m256i vec_i32); + +template <> +template <> +__m256 Clamp::apply<__m256, __m256>(const __m256 vec_f32); + +// Conversion optimized specializations +// --- f32 -> other +template <> +template <> +struct Converter::Optimized { + static constexpr bool enabled = true; + static void run(const float* in, float16* out); +}; + +template <> +template <> +struct Converter::Optimized> { + static constexpr bool enabled = true; + static void run(const float* in, float16* out); +}; + +template <> +template <> +struct Converter::Optimized { + static constexpr bool enabled = true; + static void run(const float* in, int8_t* out); +}; + +// --- f16 -> other +template <> +template <> +struct Converter::Optimized { + static constexpr bool enabled = true; + static void run(const float16* in, float* out); +}; + +template <> +template <> +struct Converter::Optimized { + static constexpr bool enabled = true; + static void run(const float16* in, int8_t* out); +}; + +// --- bf16 -> other +template <> +template <> +struct Converter::Optimized> { + static constexpr bool enabled = true; + static void run(const bfloat16* in, float16* out); +}; + +template <> +template <> +struct Converter::Optimized { + static constexpr bool enabled = true; + static void run(const bfloat16* in, float* out); +}; + +// --- u8 -> other +template <> +template <> +struct Converter::Optimized { + static constexpr bool enabled = true; + static void run(const uint8_t* in, float16* out); +}; +# endif // HAVE_AVX2 +} // namespace reference +} // namespace ov +#endif diff --git a/src/core/reference/src/op/convert.cpp b/src/core/reference/src/op/convert.cpp index 18ad7a6f4717e5..5054121b5615c0 100644 --- a/src/core/reference/src/op/convert.cpp +++ b/src/core/reference/src/op/convert.cpp @@ -4,18 +4,23 @@ #include "openvino/reference/convert.hpp" -#if OV_CORE_USE_XBYAK_JIT +#include "openvino/reference/utils/convert_util.hpp" + +#ifdef OV_CORE_USE_XBYAK_JIT # include "jit_generator.hpp" +#endif -using namespace ov::runtime; -#endif // OV_CORE_USE_XBYAK_JIT +#ifdef OV_CORE_USE_INTRINSICS +# include "openvino/reference/utils/convert_x86_intrinsics.hpp" +#endif namespace ov { namespace reference { -#if OV_CORE_USE_XBYAK_JIT + namespace { +#ifdef OV_CORE_USE_XBYAK_JIT template -void jit_convert_vec(jit::Generator&, const Xbyak::RegExp&, const Xbyak::RegExp&); +void jit_convert_vec(jit::Generator&, const Xbyak::RegExp&, const Xbyak::RegExp&) {} template void jit_convert_vec_prepare(jit::Generator&) {} @@ -265,40 +270,6 @@ class jit_convert_array : public jit::Generator { } }; -template -void convert_impl(const TI* arg, TO* out, size_t count) { - auto converter = jit_convert_array::get(); - - if (converter) { - jit_convert_array::args_t args = {arg, out, count}; - converter(&args); - } else { - for (size_t i = 0; i < count; ++i) { - out[i] = static_cast(arg[i]); - } - } -} - -template <> -void convert_impl(const float* arg, float16* out, size_t count) { - auto converter = jit_convert_array::get(); - - if (converter) { - jit_convert_array::args_t args = {arg, out, count}; - converter(&args); - } else { - for (size_t i = 0; i < count; ++i) { - if (arg[i] > std::numeric_limits::max()) { - out[i] = std::numeric_limits::max(); - } else if (arg[i] < std::numeric_limits::lowest()) { - out[i] = std::numeric_limits::lowest(); - } else { - out[i] = static_cast(arg[i]); - } - } - } -} - template void jit_count_out_of_range_vec_prepare(jit::Generator&) {} @@ -504,114 +475,88 @@ class jit_count_out_of_range : public jit::Generator { } }; +#endif // OV_CORE_USE_XBYAK_JIT + +template +void convert_impl(const TI* arg, TO* out, size_t count) { +#ifdef OV_CORE_USE_XBYAK_JIT + if (auto converter = jit_convert_array::get()) { + jit_convert_array::args_t args = {arg, out, count}; + converter(&args); + } else +#endif + { + Converter::template apply(arg, out, count); + } +} } // namespace template <> void convert(const uint8_t* arg, float16* out, size_t count) { - convert_impl(arg, out, count); + convert_impl(arg, out, count); } template <> void convert(const float16* arg, float* out, size_t count) { - convert_impl(arg, out, count); + convert_impl(arg, out, count); } template <> void convert(const float* arg, float16* out, size_t count) { - convert_impl(arg, out, count); + convert_impl(arg, out, count); } template <> void convert(const float* arg, int8_t* out, size_t count) { - convert_impl(arg, out, count); + convert_impl(arg, out, count); } template <> void convert(const float16* arg, int8_t* out, size_t count) { - convert_impl(arg, out, count); + convert_impl(arg, out, count); } template <> void convert(const bfloat16* arg, float16* out, size_t count) { - convert_impl(arg, out, count); + convert_impl(arg, out, count); } template <> void convert(const bfloat16* arg, float* out, size_t count) { - convert_impl(arg, out, count); + convert_impl(arg, out, count); } -#endif // OV_CORE_USE_XBYAK_JIT - void convert_from_f32_to_f16_with_clamp(const float* arg, float16* out, size_t count) { -#if OV_CORE_USE_XBYAK_JIT - convert_impl(arg, out, count); -#else - // FIXME CVS-125496: duplicate and stub for ARM, provide optimized solution - for (size_t i = 0; i < count; ++i) { - if (arg[i] > std::numeric_limits::max()) { - out[i] = std::numeric_limits::max(); - } else if (arg[i] < std::numeric_limits::lowest()) { - out[i] = std::numeric_limits::lowest(); - } else { - out[i] = static_cast(arg[i]); - } - } -#endif // OV_CORE_USE_XBYAK_JIT + convert_impl>(arg, out, count); } template <> void convert(const int32_t* arg, float16* out, size_t count) { - for (size_t i = 0; i < count; ++i) { - if (arg[i] > std::numeric_limits::max()) { - out[i] = std::numeric_limits::max(); - } else if (arg[i] < std::numeric_limits::lowest()) { - out[i] = std::numeric_limits::lowest(); - } else { - out[i] = static_cast(arg[i]); - } - } + Converter::apply>(arg, out, count); } void convert_from_bf16_to_f16_with_clamp(const bfloat16* arg, float16* out, size_t count) { -#if OV_CORE_USE_XBYAK_JIT - convert_impl(arg, out, count); -#else + // can re-use Clamp as bf16 is converted to float before clamping + using clamp_bf16_f16 = Clamp; + convert_impl(arg, out, count); // FIXME CVS-125496: duplicate and stub for ARM, provide optimized solution - for (size_t i = 0; i < count; ++i) { - if (arg[i] > std::numeric_limits::max()) { - out[i] = std::numeric_limits::max(); - } else if (arg[i] < std::numeric_limits::lowest()) { - out[i] = std::numeric_limits::lowest(); - } else { - out[i] = static_cast(arg[i]); - } - } -#endif // OV_CORE_USE_XBYAK_JIT } size_t count_out_of_f16_range(const float* arg, size_t count) { - size_t num_out_of_range = 0; - -#if OV_CORE_USE_XBYAK_JIT - auto converter = jit_count_out_of_range::get(); - if (converter) { +#ifdef OV_CORE_USE_XBYAK_JIT + if (auto converter = jit_count_out_of_range::get()) { + size_t num_out_of_range = 0; jit_count_out_of_range::args_t args = {arg, &num_out_of_range, count}; converter(&args); return num_out_of_range; } #endif // OV_CORE_USE_XBYAK_JIT - for (size_t i = 0; i < count; ++i) { - // if abs value is smaller than the smallest positive fp16, but not zero - if (std::abs(arg[i]) < ov::float16::from_bits(0x0001) && arg[i] != 0.0f) { - num_out_of_range++; - } else if (arg[i] > std::numeric_limits::max()) { - num_out_of_range++; - } else if (arg[i] < std::numeric_limits::lowest()) { - num_out_of_range++; - } - } - return num_out_of_range; + const auto is_out_of_f16_range = [](const float v) { + return (std::abs(v) < float16::from_bits(0x0001) && v != 0.0f) || (v > std::numeric_limits::max()) || + (v < std::numeric_limits::lowest()); + }; + + return std::count_if(arg, arg + count, is_out_of_f16_range); } } // namespace reference diff --git a/src/core/reference/src/op/convert_x86_intrinsics.cpp b/src/core/reference/src/op/convert_x86_intrinsics.cpp new file mode 100644 index 00000000000000..78957feaaef3c7 --- /dev/null +++ b/src/core/reference/src/op/convert_x86_intrinsics.cpp @@ -0,0 +1,96 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/reference/convert.hpp" +#if defined(OV_CORE_USE_INTRINSICS) +# include "openvino/reference/utils/convert_x86_intrinsics.hpp" + +namespace ov { +namespace reference { + +# if defined(HAVE_AVX2) +template <> +__m128i NoClamp::apply<__m256i, __m128i>(const __m256i vec_i32) { + // clang-format off + static const auto shuffle = _mm256_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, 0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1); + // clang-format on + + const auto t = _mm256_shuffle_epi8(vec_i32, shuffle); + const auto low = _mm256_castsi256_si128(t); + const auto high = _mm256_extracti128_si256(t, 1); + return _mm_or_si128(low, high); +} + +template <> +template <> +__m256 Clamp::apply<__m256, __m256>(const __m256 vec_f32) { + static const auto lo = _mm256_set1_ps(std::numeric_limits::lowest()); + static const auto hi = _mm256_set1_ps(std::numeric_limits::max()); + + return _mm256_min_ps(_mm256_max_ps(vec_f32, lo), hi); +} + +// --- f32 -> other +void Converter::Optimized>::run(const float* in, float16* out) { + auto vec_f32 = _mm256_loadu_ps(in); // load f32 input + auto vec_f16 = _mm256_cvtps_ph(Clamp::apply<__m256, __m256>(vec_f32), 0); // f32 -> f16 with clamp + _mm_storeu_si128(reinterpret_cast<__m128i*>(out), vec_f16); // store f16 output +} + +void Converter::Optimized::run(const float* in, float16* out) { + auto vec_f32 = _mm256_loadu_ps(in); // load f32 input + auto vec_f16 = _mm256_cvtps_ph(vec_f32, _MM_ROUND_NEAREST); // f32 -> f16 + _mm_storeu_si128(reinterpret_cast<__m128i*>(out), vec_f16); // store f16 output +} + +void Converter::Optimized::run(const float* in, int8_t* out) { + auto vec_f32 = _mm256_load_ps(in); // load f32 input + auto vec_i32 = _mm256_cvttps_epi32(vec_f32); // f32 -> i32 + auto vec_i8 = NoClamp::template apply<__m256i, __m128i>(vec_i32); // i32 -> i8 no clamping + _mm_storeu_si64(out, vec_i8); // store i8 output +} + +// --- f16 -> other +void Converter::Optimized::run(const float16* in, float* out) { + auto vec_f16 = _mm_loadu_si128(reinterpret_cast(in)); // load input as f16 vector + auto vec_f32 = _mm256_cvtph_ps(vec_f16); // convert f16 -> f32 + _mm256_storeu_ps(out, vec_f32); // store f32 in output +} + +void Converter::Optimized::run(const float16* in, int8_t* out) { + const auto vec_f16 = _mm_loadu_si128(reinterpret_cast(in)); // load input as f16 vector + const auto vec_f32 = _mm256_cvtph_ps(vec_f16); // convert f16 -> f32 + auto vec_i32 = _mm256_cvttps_epi32(vec_f32); // f32 -> i32 + auto vec_i8 = NoClamp::apply<__m256i, __m128i>(vec_i32); // i32 -> i8 no clamp + _mm_storeu_si64(out, vec_i8); // store i8 output +} + +// --- bf16 -> other +void Converter::Optimized>::run(const bfloat16* in, float16* out) { + auto vec_bf16 = _mm256_cvtepu16_epi32(*reinterpret_cast(in)); // expand to 32-bits + auto vec_f32 = _mm256_castsi256_ps(_mm256_slli_epi32(vec_bf16, 16)); // shift left bf16 -> f32 + auto vec_f16 = + _mm256_cvtps_ph(Clamp::apply<__m256, __m256>(vec_f32), _MM_ROUND_NEAREST); // f32 -> f16 + _mm_storeu_si128(reinterpret_cast<__m128i*>(out), vec_f16); // store f16 +} + +void Converter::Optimized::run(const bfloat16* in, float* out) { + auto vec_f32 = _mm256_cvtepu16_epi32(*reinterpret_cast(in)); // expand to 32-bits + vec_f32 = _mm256_slli_epi32(vec_f32, 16); // shift left bf16 -> f32 + _mm256_storeu_ps(out, _mm256_castsi256_ps(vec_f32)); // store f32 in output +} + +// --- u8 -> other +void Converter::Optimized::run(const uint8_t* in, float16* out) { + auto i64 = _mm_loadu_si64(in); // load u8 input + auto vec_i32 = _mm256_cvtepu8_epi32(i64); // u8 -> i32 + auto vec_f32 = _mm256_cvtepi32_ps(vec_i32); // i32 -> f32 + auto vec_f16 = _mm256_cvtps_ph(vec_f32, _MM_ROUND_NEAREST); // f32 -> f16 + _mm_storeu_si128(reinterpret_cast<__m128i*>(out), vec_f16); // store f16 output +} +# endif // HAVE_AVX2 +} // namespace reference +} // namespace ov +#endif // OV_CORE_USE_INTRINSICS diff --git a/src/core/reference/src/op/jit_generator.cpp b/src/core/reference/src/op/jit_generator.cpp index d516d210e71967..7d7da06d5da8d5 100644 --- a/src/core/reference/src/op/jit_generator.cpp +++ b/src/core/reference/src/op/jit_generator.cpp @@ -16,7 +16,7 @@ # include "openvino/core/type/float16.hpp" namespace ov { -namespace runtime { +namespace reference { namespace jit { using namespace Xbyak; @@ -191,7 +191,7 @@ void Generator::copy(const Xbyak::Reg64& dst, const Xbyak::Reg64& src, copy(dst, src, size); } } // namespace jit -} // namespace runtime +} // namespace reference } // namespace ov #endif // OPENVINO_ARCH_X86 || OPENVINO_ARCH_X86_64 diff --git a/src/core/reference/src/op/jit_generator.hpp b/src/core/reference/src/op/jit_generator.hpp index 924d582d8bc8d4..b4b9cd7a60c23f 100644 --- a/src/core/reference/src/op/jit_generator.hpp +++ b/src/core/reference/src/op/jit_generator.hpp @@ -5,90 +5,87 @@ #pragma once #if defined _WIN32 && !defined NOMINMAX -#define NOMINMAX +# define NOMINMAX #endif -#include #include +#include + namespace ov { -namespace runtime { +namespace reference { namespace jit { #ifdef XBYAK64 - static const Xbyak::Operand::Code abi_save_gpr_regs[] = { - Xbyak::Operand::RBX, - Xbyak::Operand::RBP, - Xbyak::Operand::R12, - Xbyak::Operand::R13, - Xbyak::Operand::R14, - Xbyak::Operand::R15, -#ifdef _WIN32 - Xbyak::Operand::RDI, - Xbyak::Operand::RSI, -#endif - }; +static const Xbyak::Operand::Code abi_save_gpr_regs[] = { + Xbyak::Operand::RBX, + Xbyak::Operand::RBP, + Xbyak::Operand::R12, + Xbyak::Operand::R13, + Xbyak::Operand::R14, + Xbyak::Operand::R15, +# ifdef _WIN32 + Xbyak::Operand::RDI, + Xbyak::Operand::RSI, +# endif +}; -#ifdef _WIN32 -#define abi_param1 Xbyak::Reg64(Xbyak::Operand::RCX) // RCX -#else -#define abi_param1 Xbyak::Reg64(Xbyak::Operand::RDI) // RDI -#endif +# ifdef _WIN32 +# define abi_param1 Xbyak::Reg64(Xbyak::Operand::RCX) // RCX +# else +# define abi_param1 Xbyak::Reg64(Xbyak::Operand::RDI) // RDI +# endif #endif // XBYAK64 - class Generator : public Xbyak::CodeGenerator - { - static constexpr size_t xmm_len = 16; +class Generator : public Xbyak::CodeGenerator { + static constexpr size_t xmm_len = 16; #ifdef _WIN32 - static constexpr size_t xmm_to_preserve_start = 6; - static constexpr size_t xmm_to_preserve = 10; + static constexpr size_t xmm_to_preserve_start = 6; + static constexpr size_t xmm_to_preserve = 10; #else - static constexpr size_t xmm_to_preserve_start = 0; - static constexpr size_t xmm_to_preserve = 0; + static constexpr size_t xmm_to_preserve_start = 0; + static constexpr size_t xmm_to_preserve = 0; #endif - static const size_t num_abi_save_gpr_regs = sizeof(abi_save_gpr_regs) / sizeof(abi_save_gpr_regs[0]); - const size_t size_of_abi_save_regs; - - const Xbyak::Reg64 reg_EVEX_max_8b_offt; - static constexpr int EVEX_max_8b_offt = 0x200; - - public: - const Xbyak::Reg64 param = abi_param1; - - typedef enum - { - isa_any, - sse42, - avx, - avx2, - avx512_common, - avx512_core, - avx512_core_vnni, - avx512_mic, - avx512_mic_4ops, - avx512_core_bf16, - avx512_vpopcnt, - fp16 - } cpu_isa_t; - - static bool mayiuse(const cpu_isa_t cpu_isa); - static bool is_x64(); - - Generator(void* code_ptr = nullptr, size_t code_size = 16 * 1024); - void preamble(); - void postamble(); - - void foreach (const Xbyak::Reg64& idx, - size_t step, - const Xbyak::Reg64& end, - std::function && fn); - - template - void copy(const Xbyak::Reg64& dst, - const Xbyak::Reg64& src, - const Xbyak::Reg64& size); - }; - } - } - } // namespace ov + static const size_t num_abi_save_gpr_regs = sizeof(abi_save_gpr_regs) / sizeof(abi_save_gpr_regs[0]); + const size_t size_of_abi_save_regs; + + const Xbyak::Reg64 reg_EVEX_max_8b_offt; + static constexpr int EVEX_max_8b_offt = 0x200; + +public: + const Xbyak::Reg64 param = abi_param1; + + typedef enum { + isa_any, + sse42, + avx, + avx2, + avx512_common, + avx512_core, + avx512_core_vnni, + avx512_mic, + avx512_mic_4ops, + avx512_core_bf16, + avx512_vpopcnt, + fp16 + } cpu_isa_t; + + static bool mayiuse(const cpu_isa_t cpu_isa); + static bool is_x64(); + + Generator(void* code_ptr = nullptr, size_t code_size = 16 * 1024); + void preamble(); + void postamble(); + + void foreach (const Xbyak::Reg64& idx, + size_t step, + const Xbyak::Reg64& end, + std::function && fn); + + template + void copy(const Xbyak::Reg64& dst, const Xbyak::Reg64& src, const Xbyak::Reg64& size); +}; +} // namespace jit +} // namespace reference +} // namespace ov