diff --git a/src/core/reference/CMakeLists.txt b/src/core/reference/CMakeLists.txt
index 791489b6e5171c..f7874964233cf5 100644
--- a/src/core/reference/CMakeLists.txt
+++ b/src/core/reference/CMakeLists.txt
@@ -21,6 +21,18 @@ add_library(${TARGET_NAME} STATIC ${LIBRARY_SRC} ${PUBLIC_HEADERS})
 add_library(openvino::reference ALIAS ${TARGET_NAME})
 set_target_properties(${TARGET_NAME} PROPERTIES EXPORT_NAME reference)
 
+if(ENABLE_AVX2)
+    ov_avx2_optimization_flags(avx2_flags)
+
+    set(OV_REFERENCE_X86_AVX2_SRC
+        ${CMAKE_CURRENT_SOURCE_DIR}/src/op/convert_x86_intrinsics.cpp
+    )
+    set_source_files_properties(${OV_REFERENCE_X86_AVX2_SRC} PROPERTIES COMPILE_OPTIONS "${avx2_flags}"
+                                                                        SKIP_UNITY_BUILD_INCLUSION ON
+                                                                        SKIP_PRECOMPILE_HEADERS ON)
+    target_compile_definitions(${TARGET_NAME} PRIVATE HAVE_AVX2)
+endif()
+
 ov_build_target_faster(${TARGET_NAME}
     UNITY
     PCH PRIVATE "src/precomp.hpp")
diff --git a/src/core/reference/include/openvino/reference/convert.hpp b/src/core/reference/include/openvino/reference/convert.hpp
index 926af31dbf5130..efb1ec1ca21415 100644
--- a/src/core/reference/include/openvino/reference/convert.hpp
+++ b/src/core/reference/include/openvino/reference/convert.hpp
@@ -14,9 +14,11 @@
 #include "openvino/core/type/nf4.hpp"
 
 #if !defined(OS_CHROMEOS) && (defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64))
-#    define OV_CORE_USE_XBYAK_JIT 1
-#else
-#    define OV_CORE_USE_XBYAK_JIT 0
+#    define OV_CORE_USE_XBYAK_JIT
+#endif
+
+#if defined(OS_CHROMEOS) && defined(OPENVINO_ARCH_X86_64) && defined(HAVE_AVX2)
+#    define OV_CORE_USE_INTRINSICS
 #endif
 
 namespace ov {
@@ -33,12 +35,12 @@ namespace reference {
 namespace detail {
 
 template <typename TI, typename TO>
-typename std::enable_if<!std::is_same<TO, char>::value, TO>::type convert(const TI v) {
+constexpr typename std::enable_if<!std::is_same<TO, char>::value, TO>::type convert(const TI v) {
     return static_cast<TO>(v);
 }
 
 template <typename TI, typename TO>
-typename std::enable_if<std::is_same<TO, char>::value, TO>::type convert(const TI v) {
+constexpr typename std::enable_if<std::is_same<TO, char>::value, TO>::type convert(const TI v) {
     return static_cast<char>(static_cast<bool>(v));
 }
 }  // namespace detail
@@ -62,8 +64,6 @@ void convert(const TI* arg, TO* out, const size_t count) {
     std::transform(arg, arg + count, out, detail::convert<TI, TO>);
 }
 
-#if OV_CORE_USE_XBYAK_JIT
-
 template <>
 void convert<uint8_t, float16>(const uint8_t* arg, float16* out, size_t count);
 template <>
@@ -79,8 +79,6 @@ void convert<bfloat16, float16>(const bfloat16* arg, float16* out, size_t count)
 template <>
 void convert<bfloat16, float>(const bfloat16* arg, float* out, size_t count);
 
-#endif  // OV_CORE_USE_XBYAK_JIT
-
 template <>
 void convert<int32_t, float16>(const int32_t* arg, float16* out, size_t count);
 
diff --git a/src/core/reference/include/openvino/reference/utils/convert_util.hpp b/src/core/reference/include/openvino/reference/utils/convert_util.hpp
new file mode 100644
index 00000000000000..3be10c9fd19fbb
--- /dev/null
+++ b/src/core/reference/include/openvino/reference/utils/convert_util.hpp
@@ -0,0 +1,88 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <functional>
+#include <type_traits>
+
+#include "openvino/reference/convert.hpp"
+
+namespace ov {
+
+// forward declare from inference dev API (cannot be included)
+extern bool with_cpu_x86_avx2();
+
+namespace reference {
+
+struct NoClamp {
+    static constexpr bool enabled = false;
+
+    // Generic implementation
+    template <class T>
+    static constexpr T apply(const T v) {
+        return v;
+    }
+
+    // Specialize for optimization
+    template <class T, class R>
+    static R apply(const T v);
+};
+
+template <class TI, class TO>
+struct Clamp {
+    static constexpr bool enabled = true;
+
+    // Generic implementation
+    static constexpr TO apply(const TI v) {
+        return (v < std::numeric_limits<TO>::lowest())
+                   ? std::numeric_limits<TO>::lowest()
+                   : ((v > std::numeric_limits<TO>::max()) ? std::numeric_limits<TO>::max()
+                                                           : detail::convert<TI, TO>(v));
+    }
+
+    // Specialize for optimization
+    template <class T, class R>
+    static R apply(const T v);
+};
+
+template <class TI, class TO>
+struct Converter {
+    static constexpr size_t vec_f32_size = 32 / sizeof(float);
+
+    // Generic implementation to convert tail elements
+    template <class ClampMode>
+    static void tail(const TI* in, TO* out, size_t n) {
+        std::transform(in, in + n, out, [](const TI v) {
+            return detail::convert<decltype(ClampMode::apply(v)), TO>(ClampMode::apply(v));
+        });
+    }
+
+    // Helper struct to defined optimized version of conversion
+    template <class ClampMode>
+    struct Optimized {
+        static constexpr bool enabled = false;
+        static void run(const TI* in, TO* out) {}
+    };
+
+    // Generic implementation of conversion
+    template <class ClampMode, typename std::enable_if<!Optimized<ClampMode>::enabled>::type* = nullptr>
+    static void apply(const TI* in, TO* out, size_t n) {
+        return tail<ClampMode>(in, out, n);
+    }
+
+    // Enabled when Optimized struct specialized defined for optimization
+    template <class ClampMode, typename std::enable_if<Optimized<ClampMode>::enabled>::type* = nullptr>
+    static void apply(const TI* in, TO* out, size_t n) {
+        if (with_cpu_x86_avx2()) {
+            for (; n >= vec_f32_size; n -= vec_f32_size, in += vec_f32_size, out += vec_f32_size) {
+                Optimized<ClampMode>::run(in, out);
+            }
+        }
+        tail<ClampMode>(in, out, n);
+    }
+};
+
+}  // namespace reference
+}  // namespace ov
diff --git a/src/core/reference/include/openvino/reference/utils/convert_x86_intrinsics.hpp b/src/core/reference/include/openvino/reference/utils/convert_x86_intrinsics.hpp
new file mode 100644
index 00000000000000..1716f05ba6ac78
--- /dev/null
+++ b/src/core/reference/include/openvino/reference/utils/convert_x86_intrinsics.hpp
@@ -0,0 +1,87 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#ifdef OV_CORE_USE_INTRINSICS
+#    include <immintrin.h>
+
+#    include "openvino/reference/utils/convert_util.hpp"
+
+namespace ov {
+namespace reference {
+#    ifdef HAVE_AVX2
+
+// Clamp optimized specializations
+template <>
+__m128i NoClamp::apply<__m256i, __m128i>(const __m256i vec_i32);
+
+template <>
+template <>
+__m256 Clamp<float, float16>::apply<__m256, __m256>(const __m256 vec_f32);
+
+// Conversion optimized specializations
+// --- f32 -> other
+template <>
+template <>
+struct Converter<float, float16>::Optimized<NoClamp> {
+    static constexpr bool enabled = true;
+    static void run(const float* in, float16* out);
+};
+
+template <>
+template <>
+struct Converter<float, float16>::Optimized<Clamp<float, float16>> {
+    static constexpr bool enabled = true;
+    static void run(const float* in, float16* out);
+};
+
+template <>
+template <>
+struct Converter<float, int8_t>::Optimized<NoClamp> {
+    static constexpr bool enabled = true;
+    static void run(const float* in, int8_t* out);
+};
+
+// --- f16 -> other
+template <>
+template <>
+struct Converter<float16, float>::Optimized<NoClamp> {
+    static constexpr bool enabled = true;
+    static void run(const float16* in, float* out);
+};
+
+template <>
+template <>
+struct Converter<float16, int8_t>::Optimized<NoClamp> {
+    static constexpr bool enabled = true;
+    static void run(const float16* in, int8_t* out);
+};
+
+// --- bf16 -> other
+template <>
+template <>
+struct Converter<bfloat16, float16>::Optimized<Clamp<float, float16>> {
+    static constexpr bool enabled = true;
+    static void run(const bfloat16* in, float16* out);
+};
+
+template <>
+template <>
+struct Converter<bfloat16, float>::Optimized<NoClamp> {
+    static constexpr bool enabled = true;
+    static void run(const bfloat16* in, float* out);
+};
+
+// --- u8 -> other
+template <>
+template <>
+struct Converter<uint8_t, float16>::Optimized<NoClamp> {
+    static constexpr bool enabled = true;
+    static void run(const uint8_t* in, float16* out);
+};
+#    endif  // HAVE_AVX2
+}  // namespace reference
+}  // namespace ov
+#endif
diff --git a/src/core/reference/src/op/convert.cpp b/src/core/reference/src/op/convert.cpp
index 18ad7a6f4717e5..5054121b5615c0 100644
--- a/src/core/reference/src/op/convert.cpp
+++ b/src/core/reference/src/op/convert.cpp
@@ -4,18 +4,23 @@
 
 #include "openvino/reference/convert.hpp"
 
-#if OV_CORE_USE_XBYAK_JIT
+#include "openvino/reference/utils/convert_util.hpp"
+
+#ifdef OV_CORE_USE_XBYAK_JIT
 #    include "jit_generator.hpp"
+#endif
 
-using namespace ov::runtime;
-#endif  // OV_CORE_USE_XBYAK_JIT
+#ifdef OV_CORE_USE_INTRINSICS
+#    include "openvino/reference/utils/convert_x86_intrinsics.hpp"
+#endif
 
 namespace ov {
 namespace reference {
-#if OV_CORE_USE_XBYAK_JIT
+
 namespace {
+#ifdef OV_CORE_USE_XBYAK_JIT
 template <typename src_t, typename dst_t, bool clamp = false>
-void jit_convert_vec(jit::Generator&, const Xbyak::RegExp&, const Xbyak::RegExp&);
+void jit_convert_vec(jit::Generator&, const Xbyak::RegExp&, const Xbyak::RegExp&) {}
 
 template <typename src_t, typename dst_t, bool clamp = false>
 void jit_convert_vec_prepare(jit::Generator&) {}
@@ -265,40 +270,6 @@ class jit_convert_array : public jit::Generator {
     }
 };
 
-template <typename TI, typename TO, bool clamp = false>
-void convert_impl(const TI* arg, TO* out, size_t count) {
-    auto converter = jit_convert_array::get<TI, TO, clamp>();
-
-    if (converter) {
-        jit_convert_array::args_t args = {arg, out, count};
-        converter(&args);
-    } else {
-        for (size_t i = 0; i < count; ++i) {
-            out[i] = static_cast<TO>(arg[i]);
-        }
-    }
-}
-
-template <>
-void convert_impl<float, float16, true>(const float* arg, float16* out, size_t count) {
-    auto converter = jit_convert_array::get<float, float16, true>();
-
-    if (converter) {
-        jit_convert_array::args_t args = {arg, out, count};
-        converter(&args);
-    } else {
-        for (size_t i = 0; i < count; ++i) {
-            if (arg[i] > std::numeric_limits<ov::float16>::max()) {
-                out[i] = std::numeric_limits<ov::float16>::max();
-            } else if (arg[i] < std::numeric_limits<ov::float16>::lowest()) {
-                out[i] = std::numeric_limits<ov::float16>::lowest();
-            } else {
-                out[i] = static_cast<ov::float16>(arg[i]);
-            }
-        }
-    }
-}
-
 template <typename data_t, typename range_t>
 void jit_count_out_of_range_vec_prepare(jit::Generator&) {}
 
@@ -504,114 +475,88 @@ class jit_count_out_of_range : public jit::Generator {
     }
 };
 
+#endif  // OV_CORE_USE_XBYAK_JIT
+
+template <class Clamp, typename TI, typename TO>
+void convert_impl(const TI* arg, TO* out, size_t count) {
+#ifdef OV_CORE_USE_XBYAK_JIT
+    if (auto converter = jit_convert_array::get<TI, TO, Clamp::enabled>()) {
+        jit_convert_array::args_t args = {arg, out, count};
+        converter(&args);
+    } else
+#endif
+    {
+        Converter<TI, TO>::template apply<Clamp>(arg, out, count);
+    }
+}
 }  // namespace
 
 template <>
 void convert<uint8_t, float16>(const uint8_t* arg, float16* out, size_t count) {
-    convert_impl(arg, out, count);
+    convert_impl<NoClamp>(arg, out, count);
 }
 
 template <>
 void convert<float16, float>(const float16* arg, float* out, size_t count) {
-    convert_impl(arg, out, count);
+    convert_impl<NoClamp>(arg, out, count);
 }
 
 template <>
 void convert<float, float16>(const float* arg, float16* out, size_t count) {
-    convert_impl(arg, out, count);
+    convert_impl<NoClamp>(arg, out, count);
 }
 
 template <>
 void convert<float, int8_t>(const float* arg, int8_t* out, size_t count) {
-    convert_impl(arg, out, count);
+    convert_impl<NoClamp>(arg, out, count);
 }
 
 template <>
 void convert<float16, int8_t>(const float16* arg, int8_t* out, size_t count) {
-    convert_impl(arg, out, count);
+    convert_impl<NoClamp>(arg, out, count);
 }
 
 template <>
 void convert<bfloat16, float16>(const bfloat16* arg, float16* out, size_t count) {
-    convert_impl(arg, out, count);
+    convert_impl<NoClamp>(arg, out, count);
 }
 
 template <>
 void convert<bfloat16, float>(const bfloat16* arg, float* out, size_t count) {
-    convert_impl(arg, out, count);
+    convert_impl<NoClamp>(arg, out, count);
 }
 
-#endif  // OV_CORE_USE_XBYAK_JIT
-
 void convert_from_f32_to_f16_with_clamp(const float* arg, float16* out, size_t count) {
-#if OV_CORE_USE_XBYAK_JIT
-    convert_impl<float, float16, true>(arg, out, count);
-#else
-    // FIXME CVS-125496: duplicate and stub for ARM, provide optimized solution
-    for (size_t i = 0; i < count; ++i) {
-        if (arg[i] > std::numeric_limits<ov::float16>::max()) {
-            out[i] = std::numeric_limits<ov::float16>::max();
-        } else if (arg[i] < std::numeric_limits<ov::float16>::lowest()) {
-            out[i] = std::numeric_limits<ov::float16>::lowest();
-        } else {
-            out[i] = static_cast<ov::float16>(arg[i]);
-        }
-    }
-#endif  // OV_CORE_USE_XBYAK_JIT
+    convert_impl<Clamp<float, float16>>(arg, out, count);
 }
 
 template <>
 void convert<int32_t, float16>(const int32_t* arg, float16* out, size_t count) {
-    for (size_t i = 0; i < count; ++i) {
-        if (arg[i] > std::numeric_limits<ov::float16>::max()) {
-            out[i] = std::numeric_limits<ov::float16>::max();
-        } else if (arg[i] < std::numeric_limits<ov::float16>::lowest()) {
-            out[i] = std::numeric_limits<ov::float16>::lowest();
-        } else {
-            out[i] = static_cast<ov::float16>(arg[i]);
-        }
-    }
+    Converter<int32_t, float16>::apply<Clamp<int32_t, float16>>(arg, out, count);
 }
 
 void convert_from_bf16_to_f16_with_clamp(const bfloat16* arg, float16* out, size_t count) {
-#if OV_CORE_USE_XBYAK_JIT
-    convert_impl<bfloat16, float16, true>(arg, out, count);
-#else
+    // can re-use Clamp as bf16 is converted to float before clamping
+    using clamp_bf16_f16 = Clamp<float, float16>;
+    convert_impl<clamp_bf16_f16>(arg, out, count);
     // FIXME CVS-125496: duplicate and stub for ARM, provide optimized solution
-    for (size_t i = 0; i < count; ++i) {
-        if (arg[i] > std::numeric_limits<ov::float16>::max()) {
-            out[i] = std::numeric_limits<ov::float16>::max();
-        } else if (arg[i] < std::numeric_limits<ov::float16>::lowest()) {
-            out[i] = std::numeric_limits<ov::float16>::lowest();
-        } else {
-            out[i] = static_cast<ov::float16>(arg[i]);
-        }
-    }
-#endif  // OV_CORE_USE_XBYAK_JIT
 }
 
 size_t count_out_of_f16_range(const float* arg, size_t count) {
-    size_t num_out_of_range = 0;
-
-#if OV_CORE_USE_XBYAK_JIT
-    auto converter = jit_count_out_of_range::get<float, float16>();
-    if (converter) {
+#ifdef OV_CORE_USE_XBYAK_JIT
+    if (auto converter = jit_count_out_of_range::get<float, float16>()) {
+        size_t num_out_of_range = 0;
         jit_count_out_of_range::args_t args = {arg, &num_out_of_range, count};
         converter(&args);
         return num_out_of_range;
     }
 #endif  // OV_CORE_USE_XBYAK_JIT
-    for (size_t i = 0; i < count; ++i) {
-        // if abs value is smaller than the smallest positive fp16, but not zero
-        if (std::abs(arg[i]) < ov::float16::from_bits(0x0001) && arg[i] != 0.0f) {
-            num_out_of_range++;
-        } else if (arg[i] > std::numeric_limits<ov::float16>::max()) {
-            num_out_of_range++;
-        } else if (arg[i] < std::numeric_limits<ov::float16>::lowest()) {
-            num_out_of_range++;
-        }
-    }
-    return num_out_of_range;
+    const auto is_out_of_f16_range = [](const float v) {
+        return (std::abs(v) < float16::from_bits(0x0001) && v != 0.0f) || (v > std::numeric_limits<float16>::max()) ||
+               (v < std::numeric_limits<float16>::lowest());
+    };
+
+    return std::count_if(arg, arg + count, is_out_of_f16_range);
 }
 
 }  // namespace reference
diff --git a/src/core/reference/src/op/convert_x86_intrinsics.cpp b/src/core/reference/src/op/convert_x86_intrinsics.cpp
new file mode 100644
index 00000000000000..78957feaaef3c7
--- /dev/null
+++ b/src/core/reference/src/op/convert_x86_intrinsics.cpp
@@ -0,0 +1,96 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "openvino/reference/convert.hpp"
+#if defined(OV_CORE_USE_INTRINSICS)
+#    include "openvino/reference/utils/convert_x86_intrinsics.hpp"
+
+namespace ov {
+namespace reference {
+
+#    if defined(HAVE_AVX2)
+template <>
+__m128i NoClamp::apply<__m256i, __m128i>(const __m256i vec_i32) {
+    // clang-format off
+    static const auto shuffle = _mm256_setr_epi8(0,  4,  8, 12, -1, -1, -1, -1,  -1, -1, -1, -1, -1, -1, -1, -1,
+                                                -1, -1, -1, -1,  0,  4,  8, 12,  -1, -1, -1, -1, -1, -1, -1, -1);
+    // clang-format on
+
+    const auto t = _mm256_shuffle_epi8(vec_i32, shuffle);
+    const auto low = _mm256_castsi256_si128(t);
+    const auto high = _mm256_extracti128_si256(t, 1);
+    return _mm_or_si128(low, high);
+}
+
+template <>
+template <>
+__m256 Clamp<float, float16>::apply<__m256, __m256>(const __m256 vec_f32) {
+    static const auto lo = _mm256_set1_ps(std::numeric_limits<float16>::lowest());
+    static const auto hi = _mm256_set1_ps(std::numeric_limits<float16>::max());
+
+    return _mm256_min_ps(_mm256_max_ps(vec_f32, lo), hi);
+}
+
+// --- f32 -> other
+void Converter<float, float16>::Optimized<Clamp<float, float16>>::run(const float* in, float16* out) {
+    auto vec_f32 = _mm256_loadu_ps(in);                                                        // load f32 input
+    auto vec_f16 = _mm256_cvtps_ph(Clamp<float, float16>::apply<__m256, __m256>(vec_f32), 0);  // f32 -> f16 with clamp
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(out), vec_f16);                                // store f16 output
+}
+
+void Converter<float, float16>::Optimized<NoClamp>::run(const float* in, float16* out) {
+    auto vec_f32 = _mm256_loadu_ps(in);                          // load f32 input
+    auto vec_f16 = _mm256_cvtps_ph(vec_f32, _MM_ROUND_NEAREST);  // f32 -> f16
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(out), vec_f16);  // store f16 output
+}
+
+void Converter<float, int8_t>::Optimized<NoClamp>::run(const float* in, int8_t* out) {
+    auto vec_f32 = _mm256_load_ps(in);                                 // load f32 input
+    auto vec_i32 = _mm256_cvttps_epi32(vec_f32);                       // f32 -> i32
+    auto vec_i8 = NoClamp::template apply<__m256i, __m128i>(vec_i32);  // i32 -> i8 no clamping
+    _mm_storeu_si64(out, vec_i8);                                      // store i8 output
+}
+
+// --- f16 -> other
+void Converter<float16, float>::Optimized<NoClamp>::run(const float16* in, float* out) {
+    auto vec_f16 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(in));  // load input as f16 vector
+    auto vec_f32 = _mm256_cvtph_ps(vec_f16);                               // convert f16 -> f32
+    _mm256_storeu_ps(out, vec_f32);                                        // store f32 in output
+}
+
+void Converter<float16, int8_t>::Optimized<NoClamp>::run(const float16* in, int8_t* out) {
+    const auto vec_f16 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(in));  // load input as f16 vector
+    const auto vec_f32 = _mm256_cvtph_ps(vec_f16);                               // convert f16 -> f32
+    auto vec_i32 = _mm256_cvttps_epi32(vec_f32);                                 // f32 -> i32
+    auto vec_i8 = NoClamp::apply<__m256i, __m128i>(vec_i32);                     // i32 -> i8 no clamp
+    _mm_storeu_si64(out, vec_i8);                                                // store i8 output
+}
+
+// --- bf16 -> other
+void Converter<bfloat16, float16>::Optimized<Clamp<float, float16>>::run(const bfloat16* in, float16* out) {
+    auto vec_bf16 = _mm256_cvtepu16_epi32(*reinterpret_cast<const __m128i*>(in));  // expand to 32-bits
+    auto vec_f32 = _mm256_castsi256_ps(_mm256_slli_epi32(vec_bf16, 16));           // shift left bf16 -> f32
+    auto vec_f16 =
+        _mm256_cvtps_ph(Clamp<float, float16>::apply<__m256, __m256>(vec_f32), _MM_ROUND_NEAREST);  // f32 -> f16
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(out), vec_f16);                                     // store f16
+}
+
+void Converter<bfloat16, float>::Optimized<NoClamp>::run(const bfloat16* in, float* out) {
+    auto vec_f32 = _mm256_cvtepu16_epi32(*reinterpret_cast<const __m128i*>(in));  // expand to 32-bits
+    vec_f32 = _mm256_slli_epi32(vec_f32, 16);                                     // shift left bf16 -> f32
+    _mm256_storeu_ps(out, _mm256_castsi256_ps(vec_f32));                          // store f32 in output
+}
+
+// --- u8 -> other
+void Converter<uint8_t, float16>::Optimized<NoClamp>::run(const uint8_t* in, float16* out) {
+    auto i64 = _mm_loadu_si64(in);                               // load u8 input
+    auto vec_i32 = _mm256_cvtepu8_epi32(i64);                    // u8 -> i32
+    auto vec_f32 = _mm256_cvtepi32_ps(vec_i32);                  // i32 -> f32
+    auto vec_f16 = _mm256_cvtps_ph(vec_f32, _MM_ROUND_NEAREST);  // f32 -> f16
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(out), vec_f16);  // store f16 output
+}
+#    endif  // HAVE_AVX2
+}  // namespace reference
+}  // namespace ov
+#endif  // OV_CORE_USE_INTRINSICS
diff --git a/src/core/reference/src/op/jit_generator.cpp b/src/core/reference/src/op/jit_generator.cpp
index d516d210e71967..7d7da06d5da8d5 100644
--- a/src/core/reference/src/op/jit_generator.cpp
+++ b/src/core/reference/src/op/jit_generator.cpp
@@ -16,7 +16,7 @@
 #    include "openvino/core/type/float16.hpp"
 
 namespace ov {
-namespace runtime {
+namespace reference {
 namespace jit {
 using namespace Xbyak;
 
@@ -191,7 +191,7 @@ void Generator::copy<bfloat16>(const Xbyak::Reg64& dst, const Xbyak::Reg64& src,
     copy<uint16_t>(dst, src, size);
 }
 }  // namespace jit
-}  // namespace runtime
+}  // namespace reference
 }  // namespace ov
 
 #endif  // OPENVINO_ARCH_X86 || OPENVINO_ARCH_X86_64
diff --git a/src/core/reference/src/op/jit_generator.hpp b/src/core/reference/src/op/jit_generator.hpp
index 924d582d8bc8d4..b4b9cd7a60c23f 100644
--- a/src/core/reference/src/op/jit_generator.hpp
+++ b/src/core/reference/src/op/jit_generator.hpp
@@ -5,90 +5,87 @@
 #pragma once
 
 #if defined _WIN32 && !defined NOMINMAX
-#define NOMINMAX
+#    define NOMINMAX
 #endif
 
-#include <functional>
 #include <xbyak/xbyak.h>
 
+#include <functional>
+
 namespace ov {
-namespace runtime {
+namespace reference {
 namespace jit {
 #ifdef XBYAK64
-            static const Xbyak::Operand::Code abi_save_gpr_regs[] = {
-                Xbyak::Operand::RBX,
-                Xbyak::Operand::RBP,
-                Xbyak::Operand::R12,
-                Xbyak::Operand::R13,
-                Xbyak::Operand::R14,
-                Xbyak::Operand::R15,
-#ifdef _WIN32
-                Xbyak::Operand::RDI,
-                Xbyak::Operand::RSI,
-#endif
-            };
+static const Xbyak::Operand::Code abi_save_gpr_regs[] = {
+    Xbyak::Operand::RBX,
+    Xbyak::Operand::RBP,
+    Xbyak::Operand::R12,
+    Xbyak::Operand::R13,
+    Xbyak::Operand::R14,
+    Xbyak::Operand::R15,
+#    ifdef _WIN32
+    Xbyak::Operand::RDI,
+    Xbyak::Operand::RSI,
+#    endif
+};
 
-#ifdef _WIN32
-#define abi_param1 Xbyak::Reg64(Xbyak::Operand::RCX) // RCX
-#else
-#define abi_param1 Xbyak::Reg64(Xbyak::Operand::RDI) // RDI
-#endif
+#    ifdef _WIN32
+#        define abi_param1 Xbyak::Reg64(Xbyak::Operand::RCX)  // RCX
+#    else
+#        define abi_param1 Xbyak::Reg64(Xbyak::Operand::RDI)  // RDI
+#    endif
 #endif  // XBYAK64
 
-            class Generator : public Xbyak::CodeGenerator
-            {
-                static constexpr size_t xmm_len = 16;
+class Generator : public Xbyak::CodeGenerator {
+    static constexpr size_t xmm_len = 16;
 
 #ifdef _WIN32
-                static constexpr size_t xmm_to_preserve_start = 6;
-                static constexpr size_t xmm_to_preserve = 10;
+    static constexpr size_t xmm_to_preserve_start = 6;
+    static constexpr size_t xmm_to_preserve = 10;
 #else
-                static constexpr size_t xmm_to_preserve_start = 0;
-                static constexpr size_t xmm_to_preserve = 0;
+    static constexpr size_t xmm_to_preserve_start = 0;
+    static constexpr size_t xmm_to_preserve = 0;
 #endif
 
-                static const size_t num_abi_save_gpr_regs = sizeof(abi_save_gpr_regs) / sizeof(abi_save_gpr_regs[0]);
-                const size_t size_of_abi_save_regs;
-
-                const Xbyak::Reg64 reg_EVEX_max_8b_offt;
-                static constexpr int EVEX_max_8b_offt = 0x200;
-
-            public:
-                const Xbyak::Reg64 param = abi_param1;
-
-                typedef enum
-                {
-                    isa_any,
-                    sse42,
-                    avx,
-                    avx2,
-                    avx512_common,
-                    avx512_core,
-                    avx512_core_vnni,
-                    avx512_mic,
-                    avx512_mic_4ops,
-                    avx512_core_bf16,
-                    avx512_vpopcnt,
-                    fp16
-                } cpu_isa_t;
-
-                static bool mayiuse(const cpu_isa_t cpu_isa);
-                static bool is_x64();
-
-                Generator(void* code_ptr = nullptr, size_t code_size = 16 * 1024);
-                void preamble();
-                void postamble();
-
-                void foreach (const Xbyak::Reg64& idx,
-                              size_t step,
-                              const Xbyak::Reg64& end,
-                              std::function<void(const Xbyak::Reg64&)> && fn);
-
-                template <typename T>
-                void copy(const Xbyak::Reg64& dst,
-                          const Xbyak::Reg64& src,
-                          const Xbyak::Reg64& size);
-            };
-        }
-    }
-    }  // namespace ov
+    static const size_t num_abi_save_gpr_regs = sizeof(abi_save_gpr_regs) / sizeof(abi_save_gpr_regs[0]);
+    const size_t size_of_abi_save_regs;
+
+    const Xbyak::Reg64 reg_EVEX_max_8b_offt;
+    static constexpr int EVEX_max_8b_offt = 0x200;
+
+public:
+    const Xbyak::Reg64 param = abi_param1;
+
+    typedef enum {
+        isa_any,
+        sse42,
+        avx,
+        avx2,
+        avx512_common,
+        avx512_core,
+        avx512_core_vnni,
+        avx512_mic,
+        avx512_mic_4ops,
+        avx512_core_bf16,
+        avx512_vpopcnt,
+        fp16
+    } cpu_isa_t;
+
+    static bool mayiuse(const cpu_isa_t cpu_isa);
+    static bool is_x64();
+
+    Generator(void* code_ptr = nullptr, size_t code_size = 16 * 1024);
+    void preamble();
+    void postamble();
+
+    void foreach (const Xbyak::Reg64& idx,
+                  size_t step,
+                  const Xbyak::Reg64& end,
+                  std::function<void(const Xbyak::Reg64&)> && fn);
+
+    template <typename T>
+    void copy(const Xbyak::Reg64& dst, const Xbyak::Reg64& src, const Xbyak::Reg64& size);
+};
+}  // namespace jit
+}  // namespace reference
+}  // namespace ov