[core] Convert reference use intrinsic implementation for ChromeOS (#26870)

### Details: - Add intrinsic implementation for convert reference ### Tickets: - CVS-152654
openvinotoolkit · Oct 4, 2024 · 817bc1a · 817bc1a
1 parent 36c9a43
commit 817bc1a
Show file tree

Hide file tree

Showing 8 changed files with 406 additions and 183 deletions.
diff --git a/src/core/reference/CMakeLists.txt b/src/core/reference/CMakeLists.txt
@@ -21,6 +21,18 @@ add_library(${TARGET_NAME} STATIC ${LIBRARY_SRC} ${PUBLIC_HEADERS})
 add_library(openvino::reference ALIAS ${TARGET_NAME})
 set_target_properties(${TARGET_NAME} PROPERTIES EXPORT_NAME reference)
 
+if(ENABLE_AVX2)  # AVX2-only build setup; nothing below applies when ENABLE_AVX2 is off
+    ov_avx2_optimization_flags(avx2_flags)  # presumably stores the compiler's AVX2 options in avx2_flags (helper defined outside this diff)
+
+    set(OV_REFERENCE_X86_AVX2_SRC  # sources that receive the AVX2 compile options
+        ${CMAKE_CURRENT_SOURCE_DIR}/src/op/convert_x86_intrinsics.cpp
+    )
+    set_source_files_properties(${OV_REFERENCE_X86_AVX2_SRC} PROPERTIES COMPILE_OPTIONS "${avx2_flags}"  # per-file flags only, not target-wide
+                                                                        SKIP_UNITY_BUILD_INCLUSION ON  # keep this file out of unity (jumbo) sources
+                                                                        SKIP_PRECOMPILE_HEADERS ON)  # do not use the target's precompiled header for it
+    target_compile_definitions(${TARGET_NAME} PRIVATE HAVE_AVX2)  # PRIVATE: defined for this target's own sources, not for its consumers
+endif()
+
 ov_build_target_faster(${TARGET_NAME}
     UNITY
     PCH PRIVATE "src/precomp.hpp")

diff --git a/src/core/reference/include/openvino/reference/convert.hpp b/src/core/reference/include/openvino/reference/convert.hpp
@@ -14,9 +14,11 @@
 #include "openvino/core/type/nf4.hpp"
 
 #if !defined(OS_CHROMEOS) && (defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64))
-#    define OV_CORE_USE_XBYAK_JIT 1
-#else
-#    define OV_CORE_USE_XBYAK_JIT 0
+#    define OV_CORE_USE_XBYAK_JIT  // now a defined-or-not flag with no value: test it with #ifdef / defined(), not #if
+#endif
+
+#if defined(OS_CHROMEOS) && defined(OPENVINO_ARCH_X86_64) && defined(HAVE_AVX2)  // ChromeOS x86-64 build that also has HAVE_AVX2 defined
+#    define OV_CORE_USE_INTRINSICS  // mutually exclusive with OV_CORE_USE_XBYAK_JIT, which requires !OS_CHROMEOS
 #endif
 
 namespace ov {
@@ -33,12 +35,12 @@ namespace reference {
 namespace detail {
 
 template <typename TI, typename TO>
-typename std::enable_if<!std::is_same<TO, char>::value, TO>::type convert(const TI v) {
+constexpr typename std::enable_if<!std::is_same<TO, char>::value, TO>::type convert(const TI v) {  // overload for TO != char: plain static_cast of v to TO
     return static_cast<TO>(v);
 }
 
 template <typename TI, typename TO>
-typename std::enable_if<std::is_same<TO, char>::value, TO>::type convert(const TI v) {
+constexpr typename std::enable_if<std::is_same<TO, char>::value, TO>::type convert(const TI v) {  // overload for TO == char: v is reduced to bool first, so the result is 0 or 1
     return static_cast<char>(static_cast<bool>(v));
 }
 }  // namespace detail
@@ -62,8 +64,6 @@ void convert(const TI* arg, TO* out, const size_t count) {
     std::transform(arg, arg + count, out, detail::convert<TI, TO>);
 }
 
-#if OV_CORE_USE_XBYAK_JIT
-
 template <>
 void convert<uint8_t, float16>(const uint8_t* arg, float16* out, size_t count);
 template <>
@@ -79,8 +79,6 @@ void convert<bfloat16, float16>(const bfloat16* arg, float16* out, size_t count)
 template <>
 void convert<bfloat16, float>(const bfloat16* arg, float* out, size_t count);
 
-#endif  // OV_CORE_USE_XBYAK_JIT
-
 template <>
 void convert<int32_t, float16>(const int32_t* arg, float16* out, size_t count);
 

diff --git a/src/core/reference/include/openvino/reference/utils/convert_util.hpp b/src/core/reference/include/openvino/reference/utils/convert_util.hpp
@@ -0,0 +1,88 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <functional>
+#include <type_traits>
+
+#include "openvino/reference/convert.hpp"
+
+namespace ov {
+
+// Forward declaration from the inference dev API (its header cannot be included here); Converter::apply calls it before using the optimized path.
+extern bool with_cpu_x86_avx2();
+
+namespace reference {
+
+struct NoClamp {  // Clamp policy that performs no range limiting; used as the ClampMode argument of Converter.
+    static constexpr bool enabled = false;  // compile-time flag: clamping is off for this policy
+
+    // Generic implementation: identity, returns v unchanged.
+    template <class T>
+    static constexpr T apply(const T v) {
+        return v;
+    }
+
+    // Declaration only; explicit specializations provide optimized variants taking T and returning R (R is not deducible and must be spelled out).
+    template <class T, class R>
+    static R apply(const T v);
+};
+
+template <class TI, class TO>
+struct Clamp {  // Clamp policy: saturates a TI value to the [lowest(), max()] range of TO while converting it.
+    static constexpr bool enabled = true;  // compile-time flag: clamping is on for this policy
+
+    // Generic implementation: below lowest() -> lowest(), above max() -> max(), otherwise detail::convert<TI, TO>(v).
+    static constexpr TO apply(const TI v) {
+        return (v < std::numeric_limits<TO>::lowest())  // NOTE(review): std::numeric_limits is from <limits>, not included directly by this header - confirm it arrives via convert.hpp
+                   ? std::numeric_limits<TO>::lowest()
+                   : ((v > std::numeric_limits<TO>::max()) ? std::numeric_limits<TO>::max()
+                                                           : detail::convert<TI, TO>(v));  // reached when neither comparison is true
+    }
+
+    // Declaration only; explicit specializations provide optimized variants taking T and returning R (R is not deducible and must be spelled out).
+    template <class T, class R>
+    static R apply(const T v);
+};
+
+template <class TI, class TO>
+struct Converter {  // Converts arrays of TI into TO; uses a block kernel when an Optimized<ClampMode> specialization is enabled.
+    static constexpr size_t vec_f32_size = 32 / sizeof(float);  // floats per 32 bytes; element count consumed by each Optimized::run call in apply()
+
+    // Generic element-wise implementation, also used for tail elements: ClampMode::apply first, then detail::convert to TO.
+    template <class ClampMode>
+    static void tail(const TI* in, TO* out, size_t n) {
+        std::transform(in, in + n, out, [](const TI v) {  // NOTE(review): std::transform is from <algorithm>, not included directly by this header - confirm it arrives via convert.hpp
+            return detail::convert<decltype(ClampMode::apply(v)), TO>(ClampMode::apply(v));
+        });
+    }
+
+    // Helper struct used to define an optimized version of the conversion; this primary template is the disabled default.
+    template <class ClampMode>
+    struct Optimized {
+        static constexpr bool enabled = false;  // explicit specializations set this to true to opt in
+        static void run(const TI* in, TO* out) {}  // empty placeholder: apply() in this header only calls run when enabled is true
+    };
+
+    // Generic implementation of conversion, selected by enable_if when Optimized<ClampMode>::enabled is false: all n elements go through tail().
+    template <class ClampMode, typename std::enable_if<!Optimized<ClampMode>::enabled>::type* = nullptr>
+    static void apply(const TI* in, TO* out, size_t n) {
+        return tail<ClampMode>(in, out, n);
+    }
+
+    // Selected by enable_if when an Optimized<ClampMode> specialization with enabled == true exists.
+    template <class ClampMode, typename std::enable_if<Optimized<ClampMode>::enabled>::type* = nullptr>
+    static void apply(const TI* in, TO* out, size_t n) {
+        if (with_cpu_x86_avx2()) {  // run-time check; when it returns false every element is handled by tail() below
+            for (; n >= vec_f32_size; n -= vec_f32_size, in += vec_f32_size, out += vec_f32_size) {
+                Optimized<ClampMode>::run(in, out);  // converts one full block of vec_f32_size elements
+            }
+        }
+        tail<ClampMode>(in, out, n);  // whatever the block loop left over (n is unchanged if the loop did not run)
+    }
+};
+
+}  // namespace reference
+}  // namespace ov
diff --git a/src/core/reference/include/openvino/reference/utils/convert_x86_intrinsics.hpp b/src/core/reference/include/openvino/reference/utils/convert_x86_intrinsics.hpp
@@ -0,0 +1,87 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#ifdef OV_CORE_USE_INTRINSICS  // NOTE(review): this macro is defined in openvino/reference/convert.hpp, which is only included further down inside this guard - a file that includes this header first gets an empty header; confirm include order in all users
+#    include <immintrin.h>
+
+#    include "openvino/reference/utils/convert_util.hpp"
+
+namespace ov {
+namespace reference {
+#    ifdef HAVE_AVX2
+
+// Clamp-policy specializations for vector types; declarations only, the definitions are not part of this header.
+template <>
+__m128i NoClamp::apply<__m256i, __m128i>(const __m256i vec_i32);  // takes a 256-bit integer vector, returns a 128-bit one; presumably narrows i32 lanes - confirm against the definition
+
+template <>
+template <>
+__m256 Clamp<float, float16>::apply<__m256, __m256>(const __m256 vec_f32);  // vector form of the f32 -> f16 clamp policy: __m256 in, __m256 out; declaration only, defined elsewhere
+
+// Conversion optimized specializations: each sets enabled = true, which selects Converter::apply's block-loop overload, and declares run() (defined elsewhere).
+// --- f32 -> other (f16 without and with clamping, i8 without clamping)
+template <>
+template <>
+struct Converter<float, float16>::Optimized<NoClamp> {
+    static constexpr bool enabled = true;
+    static void run(const float* in, float16* out);  // called by Converter::apply once per block of vec_f32_size elements
+};
+
+template <>
+template <>
+struct Converter<float, float16>::Optimized<Clamp<float, float16>> {
+    static constexpr bool enabled = true;
+    static void run(const float* in, float16* out);  // clamping variant of the f32 -> f16 block kernel
+};
+
+template <>
+template <>
+struct Converter<float, int8_t>::Optimized<NoClamp> {
+    static constexpr bool enabled = true;
+    static void run(const float* in, int8_t* out);  // f32 -> i8 block kernel, no clamp policy
+};
+
+// --- f16 -> other (f32 and i8, both without clamping); enabled = true opts in to Converter::apply's block loop, run() is defined elsewhere
+template <>
+template <>
+struct Converter<float16, float>::Optimized<NoClamp> {
+    static constexpr bool enabled = true;
+    static void run(const float16* in, float* out);  // called by Converter::apply once per block of vec_f32_size elements
+};
+
+template <>
+template <>
+struct Converter<float16, int8_t>::Optimized<NoClamp> {
+    static constexpr bool enabled = true;
+    static void run(const float16* in, int8_t* out);  // f16 -> i8 block kernel, no clamp policy
+};
+
+// --- bf16 -> other (f16 with clamping, f32 without); enabled = true opts in to Converter::apply's block loop, run() is defined elsewhere
+template <>
+template <>
+struct Converter<bfloat16, float16>::Optimized<Clamp<float, float16>> {  // note: the clamp policy is Clamp<float, float16>, i.e. its input type is float rather than bfloat16
+    static constexpr bool enabled = true;
+    static void run(const bfloat16* in, float16* out);  // called by Converter::apply once per block of vec_f32_size elements
+};
+
+template <>
+template <>
+struct Converter<bfloat16, float>::Optimized<NoClamp> {
+    static constexpr bool enabled = true;
+    static void run(const bfloat16* in, float* out);  // bf16 -> f32 block kernel, no clamp policy
+};
+
+// --- u8 -> other (f16 without clamping); enabled = true opts in to Converter::apply's block loop, run() is defined elsewhere
+template <>
+template <>
+struct Converter<uint8_t, float16>::Optimized<NoClamp> {
+    static constexpr bool enabled = true;
+    static void run(const uint8_t* in, float16* out);  // called by Converter::apply once per block of vec_f32_size elements
+};
+#    endif  // HAVE_AVX2
+}  // namespace reference
+}  // namespace ov
+#endif