From bb28b718492d0b9c16c2737862995d890c42f4c4 Mon Sep 17 00:00:00 2001
From: Kelvin Choi <kelvin.choi@intel.com>
Date: Tue, 31 Oct 2023 19:21:56 +0900
Subject: [PATCH 1/9] [GPU] if the reorder which is a user of reshape has
 truncation mode, it should not split the reorder node (#20749)

---
 .../graph/graph_optimizer/handle_reshape.cpp  |  3 +-
 .../unit/test_cases/reshape_gpu_test.cpp      | 49 +++++++++++++++++++
 2 files changed, 51 insertions(+), 1 deletion(-)
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/handle_reshape.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/handle_reshape.cpp
index 7b5eb3b02d33e6..c148b311d55744 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/handle_reshape.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/handle_reshape.cpp
@@ -98,7 +98,8 @@ void handle_reshape::run(program& p) {
             // find the users of reshape that are reorder type, if none present then skip the current node
             // find users who are onednn impl
             for (const auto& user : node->get_users()) {
-                if (user->is_type<reorder>())
+                if (user->is_type<reorder>() &&
+                    (*user).as<reorder>().get_primitive()->truncate == false)   // not to split conversion only reorder
                     reorder_node_to_split.push_back(user);
                 if (user->get_preferred_impl_type() == cldnn::impl_types::onednn)
                     onednn_users.push_back(user);
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/reshape_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/reshape_gpu_test.cpp
index 0d7c6cbe271f41..6f7cc179465667 100644
--- a/src/plugins/intel_gpu/tests/unit/test_cases/reshape_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/reshape_gpu_test.cpp
@@ -734,6 +734,55 @@ TEST(reshape_gpu_f32, shrink_chain_out) {
     test_shrink_chain_out<float>(false);
 }
 
+template <typename T>
+void test_shrink_chain_partial_reorder_truncate(bool is_caching_test) {
+    auto& engine = get_test_engine();
+    auto batch_num = 2;
+    auto feature_num = 2;
+    auto x_size = 1;
+    auto y_size = 1;
+    auto input = engine.allocate_memory({data_types::f32, format::bfyx, {tensor(spatial(x_size, y_size), feature(feature_num), batch(batch_num))}});
+    auto scale_in = engine.allocate_memory({data_types::f32, format::bfyx, { tensor(feature(4)) }});
+    auto shift_in = engine.allocate_memory({data_types::f32, format::bfyx, { tensor(feature(4)) }});
+
+    std::vector<T> scale_vals = {0.f, 1.f, 2.f, 3.f};
+    std::vector<T> scale_shifts = {5.f, 10.f, 15.f, 20.0f};
+    set_values(scale_in, scale_vals);
+    set_values(shift_in, scale_shifts);
+
+    topology topology;
+    topology.add(input_layout("input", input->get_layout()));
+    topology.add(data("scale_in", scale_in));
+    topology.add(data("shift_in", shift_in));
+    topology.add(activation("relu", input_info("input"), activation_func::relu));
+    topology.add(reshape("reshape", input_info("relu"), tensor(spatial(2, 2))));
+    topology.add(reorder("reorder", input_info("reshape"), format::bfyx, data_types::f32, {}, reorder_mean_mode::subtract, padding(), true));
+    topology.add(reshape("reshape1", input_info("reorder"), tensor(feature(4))));
+    topology.add(eltwise("scale", { input_info("reshape1"), input_info("scale_in") }, eltwise_mode::prod));
+    topology.add(eltwise("shift", { input_info("scale"), input_info("shift_in") }, eltwise_mode::sum));
+    topology.add(reorder("out_reorder", input_info("shift"), format::yxfb, data_types::f32));
+
+    std::vector<T> input_vec = {-1.f, 2.f, -3.f, 4.f};
+    std::vector<T> out = {5.f, 12.f, 15.f, 32.0f};
+    set_values(input, input_vec);
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    cldnn::network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);
+    network->set_input_data("input", input);
+    auto outputs = network->execute();
+
+    auto output = outputs.at("out_reorder").get_memory();
+    cldnn::mem_lock<T> output_ptr(output, get_test_stream());
+
+    for (size_t i = 0; i < out.size(); i++)
+        ASSERT_EQ(output_ptr[i], out[i]) << " i=" << i;
+}
+
+TEST(reshape_gpu_f32, shrink_chain_partial_reorder_truncate) {
+    test_shrink_chain_partial_reorder_truncate<float>(false);
+}
+
 TEST(reshape_gpu_f32, basic_runtime_static_shape) {
     // input:  bfwzyx, (3, 3, 2, 2, 1, 1)
     // reshape: (1, 1, 2, 2, 3, 3), pad (0, 0, 0, 0, 0, 1)

From a20c9a5a22ed5f50e3caa8479ac6c36f3540b190 Mon Sep 17 00:00:00 2001
From: Roman Kazantsev <roman.kazantsev@intel.com>
Date: Tue, 31 Oct 2023 14:45:54 +0400
Subject: [PATCH 2/9] [TF FE] Update TF FE dev docs with Inv (#20788)

Signed-off-by: Kazantsev, Roman <roman.kazantsev@intel.com>
---
 src/frontends/tensorflow/docs/supported_ops.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/frontends/tensorflow/docs/supported_ops.md b/src/frontends/tensorflow/docs/supported_ops.md
index 5794e3f16653fd..e9b9a499f55a76 100644
--- a/src/frontends/tensorflow/docs/supported_ops.md
+++ b/src/frontends/tensorflow/docs/supported_ops.md
@@ -529,7 +529,7 @@ A "supported operation" is one that TensorFlow Frontend can convert to the OpenV
 | InplaceSub                                              | NO                            |                               |
 | InplaceUpdate                                           | NO                            |                               |
 | InterleaveDataset                                       | NO                            |                               |
-| Inv                                                     | NO                            |                               |
+| Inv                                                     | YES                           |                               |
 | InvGrad                                                 | NO                            |                               |
 | Invert                                                  | NO                            |                               |
 | InvertPermutation                                       | YES                           |                               |

From da1f0199a0438f80d59c36880fe65c2b567366cd Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Tue, 31 Oct 2023 14:55:28 +0400
Subject: [PATCH 3/9] Removed WA for static protobuf for vcpkg installation
 path (#20784)

---
 .../installing-openvino-vcpkg.md              | 31 -------------------
 1 file changed, 31 deletions(-)

diff --git a/docs/articles_en/get started/installing-openvino-overview/installing-openvino-shared/installing-openvino-vcpkg.md b/docs/articles_en/get started/installing-openvino-overview/installing-openvino-shared/installing-openvino-vcpkg.md
index d0d502cfced7e2..4da210edb0d581 100644
--- a/docs/articles_en/get started/installing-openvino-overview/installing-openvino-shared/installing-openvino-vcpkg.md	
+++ b/docs/articles_en/get started/installing-openvino-overview/installing-openvino-shared/installing-openvino-vcpkg.md	
@@ -65,37 +65,6 @@ Installing OpenVINO Runtime
 Note that the vcpkg installation means building all packages and dependencies from source, 
 which means the compiler stage will require additional time to complete the process. 
 
-.. important::
-
-   If you are building OpenVINO as dynamic libraries and you want to use either Paddle, TensorFlow or ONNX frontends, you need to create `custom vcpkg <https://learn.microsoft.com/en-us/vcpkg/users/triplets#per-port-customization>`__ triplet file, like ``<VCPKG_ROOT>/triplets/community/x64-linux-release-dynamic.cmake``, which builds ``protobuf`` dependency statically:
-
-   .. code-block:: sh
-
-      # typical values of vcpkg toolchain
-      set(VCPKG_TARGET_ARCHITECTURE x64)
-      set(VCPKG_CRT_LINKAGE dynamic)
-      # by default, all libraries are built dynamically
-      set(VCPKG_LIBRARY_LINKAGE dynamic)
-
-      set(VCPKG_CMAKE_SYSTEM_NAME Linux)
-      set(VCPKG_BUILD_TYPE release)
-
-      set(VCPKG_FIXUP_ELF_RPATH ON)
-
-      # OpenVINO specific additions: build statically the following internal dependencies
-      # IMPORTANT: you need to build at least protobuf statically, others can be dynamic
-      if(PORT MATCHES "^(ade|hwloc|onnx|protobuf|pugixml|snappy)$")
-          set(VCPKG_LIBRARY_LINKAGE static)
-      endif()
-
-
-   Then, you can use such a triplet file with the following command:
-
-   .. code-block:: sh
-
-      vcpkg install 'openvino:x64-linux-release-dynamic'
-
-
 After installation, you can use OpenVINO in your product's cmake scripts:
 
 .. code-block:: sh

From 2932e9e9381c8f07cef200931470fe19044590d5 Mon Sep 17 00:00:00 2001
From: Vladislav Golubev <vladislav.golubev@intel.com>
Date: Tue, 31 Oct 2023 12:00:52 +0100
Subject: [PATCH 4/9] ReshapeBMatMul and ReshapeAMatMul: avoid circular
 dependencies creation (#20771)

---
 .../smart_reshape/matmul_sr.cpp               | 10 ++++--
 .../tests/functional/matmul_sr_tests.cpp      | 35 +++++++++++++++++++
 2 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/src/common/transformations/src/transformations/smart_reshape/matmul_sr.cpp b/src/common/transformations/src/transformations/smart_reshape/matmul_sr.cpp
index 870b69d9a55901..ff7dc8c927d0ae 100644
--- a/src/common/transformations/src/transformations/smart_reshape/matmul_sr.cpp
+++ b/src/common/transformations/src/transformations/smart_reshape/matmul_sr.cpp
@@ -61,8 +61,11 @@ ov::pass::ReshapeAMatMul::ReshapeAMatMul() {
     auto other_input_label = pattern::any_input();
     auto reshape_input_label = pattern::any_input();
     auto reshape_pattern_label = pattern::any_input();
+    auto reshape_predicate = [](ov::Output<ov::Node> output) -> bool {
+        return ov::pass::pattern::rank_equals(2)(output) && ov::pass::pattern::consumers_count(1)(output);
+    };
     auto reshape_label = ov::pass::pattern::wrap_type<ov::op::v1::Reshape>({reshape_input_label, reshape_pattern_label},
-                                                                           ov::pass::pattern::rank_equals(2));
+                                                                           reshape_predicate);
     auto matmul_label = ov::pass::pattern::wrap_type<ov::op::v0::MatMul>({reshape_label, other_input_label});
 
     matcher_pass_callback callback = [=](pattern::Matcher& m) -> bool {
@@ -83,8 +86,11 @@ ov::pass::ReshapeBMatMul::ReshapeBMatMul() {
     auto other_input_label = pattern::any_input();
     auto reshape_input_label = pattern::any_input();
     auto reshape_pattern_label = pattern::any_input();
+    auto reshape_predicate = [](ov::Output<ov::Node> output) -> bool {
+        return ov::pass::pattern::rank_equals(2)(output) && ov::pass::pattern::consumers_count(1)(output);
+    };
     auto reshape_label = ov::pass::pattern::wrap_type<ov::op::v1::Reshape>({reshape_input_label, reshape_pattern_label},
-                                                                           ov::pass::pattern::rank_equals(2));
+                                                                           reshape_predicate);
     auto matmul_label = ov::pass::pattern::wrap_type<ov::op::v0::MatMul>({other_input_label, reshape_label});
 
     matcher_pass_callback callback = [=](pattern::Matcher& m) -> bool {
diff --git a/src/inference/tests/functional/matmul_sr_tests.cpp b/src/inference/tests/functional/matmul_sr_tests.cpp
index 27a294e656e171..3d17cfd915fa58 100644
--- a/src/inference/tests/functional/matmul_sr_tests.cpp
+++ b/src/inference/tests/functional/matmul_sr_tests.cpp
@@ -10,11 +10,14 @@
 
 #include "cnn_network_ngraph_impl.hpp"
 #include "common_test_utils/graph_comparator.hpp"
+#include "common_test_utils/ov_test_utils.hpp"
 #include "common_test_utils/test_common.hpp"
 #include "ie_common.h"
+#include "openvino/op/add.hpp"
 #include "openvino/op/constant.hpp"
 #include "openvino/op/matmul.hpp"
 #include "openvino/op/parameter.hpp"
+#include "openvino/op/reduce_max.hpp"
 #include "openvino/op/reshape.hpp"
 #include "openvino/op/transpose.hpp"
 #include "openvino/op/variadic_split.hpp"
@@ -357,3 +360,35 @@ TEST(SmartReshapeTransposeMatMulTests, TransposeBothMatMulWithAttrFuse) {
     auto res = compare_functions(f, f_ref);
     ASSERT_TRUE(res.first) << res.second;
 }
+
+TEST_F(TransformationTestsF, SmartReshapeReshapeAMatMulSeveralConsumers) {
+    // Reshape has 2 consumers: matmul and reduce.
+    // Since reshape movement leads to loop creation (circular dependencies), the transformation can't be applied
+    auto data_A = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{3, 2, 3});
+    auto reshape_const = ov::op::v0::Constant::create(ov::element::i32, {2}, {3, 6});
+    auto reshape = std::make_shared<ov::op::v1::Reshape>(data_A, reshape_const, false);
+
+    auto data_B = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{6, 12});
+    auto reduction_axes = ov::op::v0::Constant::create(ov::element::i32, {2}, {0, 1});
+    auto reduce = std::make_shared<ov::op::v1::ReduceMax>(reshape, reduction_axes);
+    auto sum = std::make_shared<ov::op::v1::Add>(data_B, reduce);
+    auto matmul = std::make_shared<ov::op::v0::MatMul>(reshape, sum);
+    model = std::make_shared<ov::Model>(ov::NodeVector{matmul}, ov::ParameterVector{data_A, data_B});
+    manager.register_pass<ov::pass::ReshapeAMatMul>();
+}
+
+TEST_F(TransformationTestsF, SmartReshapeReshapeBMatMulSeveralConsumers) {
+    // Reshape has 2 consumers: matmul and reduce.
+    // Since reshape movement leads to loop creation (circular dependencies), the transformation can't be applied
+    auto data_B = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{3, 2, 3});
+    auto reshape_const = ov::op::v0::Constant::create(ov::element::i32, {2}, {6, 3});
+    auto reshape = std::make_shared<ov::op::v1::Reshape>(data_B, reshape_const, false);
+
+    auto data_A = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{12, 6});
+    auto reduction_axes = ov::op::v0::Constant::create(ov::element::i32, {2}, {0, 1});
+    auto reduce = std::make_shared<ov::op::v1::ReduceMax>(reshape, reduction_axes);
+    auto sum = std::make_shared<ov::op::v1::Add>(data_A, reduce);
+    auto matmul = std::make_shared<ov::op::v0::MatMul>(sum, reshape);
+    model = std::make_shared<ov::Model>(ov::NodeVector{matmul}, ov::ParameterVector{data_A, data_B});
+    manager.register_pass<ov::pass::ReshapeBMatMul>();
+}

From 3077bad26fe652503cdcbd2da4b75dd345a97809 Mon Sep 17 00:00:00 2001
From: Pawel Raasz <pawel.raasz@intel.com>
Date: Tue, 31 Oct 2023 12:11:29 +0100
Subject: [PATCH 5/9] [core]Migrate Sigmoid operator to new API (#20780)

* Migrate Sigmoid operator to new API

* Add missing include
---
 src/core/include/openvino/op/sigmoid.hpp      |  4 +-
 .../include/openvino/reference/sigmoid.hpp    | 32 +++---
 src/core/src/op/sigmoid.cpp                   | 98 ++++++++-----------
 3 files changed, 61 insertions(+), 73 deletions(-)

diff --git a/src/core/include/openvino/op/sigmoid.hpp b/src/core/include/openvino/op/sigmoid.hpp
index eaf6bfa14afad4..9c244e2681f7fc 100644
--- a/src/core/include/openvino/op/sigmoid.hpp
+++ b/src/core/include/openvino/op/sigmoid.hpp
@@ -18,9 +18,7 @@ class OPENVINO_API Sigmoid : public util::UnaryElementwiseArithmetic {
     Sigmoid(const Output<Node>& arg);
     Sigmoid() = default;
     std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
-    OPENVINO_SUPPRESS_DEPRECATED_START
-    bool evaluate(const HostTensorVector& outputs, const HostTensorVector& inputs) const override;
-    OPENVINO_SUPPRESS_DEPRECATED_END
+    bool evaluate(TensorVector& outputs, const TensorVector& inputs) const override;
     bool has_evaluate() const override;
 };
 }  // namespace v0
diff --git a/src/core/reference/include/openvino/reference/sigmoid.hpp b/src/core/reference/include/openvino/reference/sigmoid.hpp
index d30aedf21ae9f4..4e1daafeff3d0b 100644
--- a/src/core/reference/include/openvino/reference/sigmoid.hpp
+++ b/src/core/reference/include/openvino/reference/sigmoid.hpp
@@ -4,28 +4,30 @@
 
 #pragma once
 
+#include <algorithm>
 #include <cmath>
 #include <cstddef>
-#include <type_traits>
+
+#include "openvino/reference/utils/type_util.hpp"
 
 namespace ov {
 namespace reference {
-template <typename T, typename std::enable_if<std::is_integral<T>::value, bool>::type = true>
-void sigmoid(const T* arg, T* out, size_t count) {
-    T exp_value;
-    for (size_t i = 0; i < count; i++) {
-        exp_value = static_cast<T>(std::exp(-static_cast<typename std::make_signed<T>::type>(arg[i])));
-        out[i] = static_cast<T>(1 / (1 + exp_value));
-    }
+namespace func {
+template <class T, typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
+T sigmoid(const T value) {
+    const auto exp_value = static_cast<T>(std::exp(-static_cast<typename std::make_signed<T>::type>(value)));
+    return 1 / (1 + exp_value);
+}
+
+template <class T, typename std::enable_if<ov::is_floating_point<T>()>::type* = nullptr>
+T sigmoid(const T value) {
+    return 1 / (1 + std::exp(-value));
 }
+}  // namespace func
 
-template <typename T, typename std::enable_if<!std::is_integral<T>::value, bool>::type = true>
-void sigmoid(const T* arg, T* out, size_t count) {
-    T exp_value;
-    for (size_t i = 0; i < count; i++) {
-        exp_value = static_cast<T>(std::exp(-arg[i]));
-        out[i] = static_cast<T>(1 / (1 + exp_value));
-    }
+template <class T>
+void sigmoid(const T* arg, T* out, const size_t count) {
+    std::transform(arg, arg + count, out, func::sigmoid<T>);
 }
 }  // namespace reference
 }  // namespace ov
diff --git a/src/core/src/op/sigmoid.cpp b/src/core/src/op/sigmoid.cpp
index 9966dbcab8d69b..a4ce31db1e3a97 100644
--- a/src/core/src/op/sigmoid.cpp
+++ b/src/core/src/op/sigmoid.cpp
@@ -2,80 +2,68 @@
 // SPDX-License-Identifier: Apache-2.0
 //
 
-#include "ngraph/op/sigmoid.hpp"
-
-#include <ngraph/validation_util.hpp>
+#include "openvino/op/sigmoid.hpp"
 
+#include "element_visitor.hpp"
 #include "itt.hpp"
-#include "ngraph/log.hpp"
-#include "ngraph/runtime/host_tensor.hpp"
-#include "ngraph/util.hpp"
 #include "openvino/reference/sigmoid.hpp"
 
-using namespace std;
-using namespace ngraph;
+namespace ov {
+namespace op {
+namespace sigmoid {
+
+struct Evaluate : element::NoAction<bool> {
+    using element::NoAction<bool>::visit;
 
-shared_ptr<Node> ov::op::v0::Sigmoid::clone_with_new_inputs(const OutputVector& new_args) const {
+    template <element::Type_t ET, class T = fundamental_type_for<ET>>
+    static result_type visit(const Tensor& arg0, Tensor& out, const size_t count) {
+        reference::sigmoid(arg0.data<const T>(), out.data<T>(), count);
+        return true;
+    }
+};
+}  // namespace sigmoid
+
+namespace v0 {
+
+std::shared_ptr<Node> Sigmoid::clone_with_new_inputs(const OutputVector& new_args) const {
     OV_OP_SCOPE(v0_Sigmoid_clone_with_new_inputs);
     check_new_args_count(this, new_args);
-    return make_shared<Sigmoid>(new_args.at(0));
+    return std::make_shared<Sigmoid>(new_args.at(0));
 }
 
-ov::op::v0::Sigmoid::Sigmoid(const Output<Node>& arg) : UnaryElementwiseArithmetic(arg) {
+Sigmoid::Sigmoid(const Output<Node>& arg) : UnaryElementwiseArithmetic(arg) {
     constructor_validate_and_infer_types();
 }
 
-OPENVINO_SUPPRESS_DEPRECATED_START
-namespace sigmoid {
-namespace {
-template <element::Type_t ET>
-inline bool evaluate(const HostTensorPtr& arg0, const HostTensorPtr& out, const size_t count) {
-    using T = typename element_type_traits<ET>::value_type;
-    ov::reference::sigmoid<T>(arg0->get_data_ptr<ET>(), out->get_data_ptr<ET>(), count);
-    return true;
-}
-
-bool evaluate_sigmoid(const HostTensorPtr& arg0, const HostTensorPtr& out) {
-    bool rc = true;
-    size_t count = shape_size(arg0->get_shape());
-    out->set_unary(arg0);
+bool Sigmoid::evaluate(TensorVector& outputs, const TensorVector& inputs) const {
+    OV_OP_SCOPE(v0_Sigmoid_evaluate);
+    OPENVINO_ASSERT(outputs.size() == 1);
+    OPENVINO_ASSERT(inputs.size() == 1);
 
-    switch (arg0->get_element_type()) {
-        OPENVINO_TYPE_CASE(evaluate_sigmoid, i32, arg0, out, count);
-        OPENVINO_TYPE_CASE(evaluate_sigmoid, i64, arg0, out, count);
-        OPENVINO_TYPE_CASE(evaluate_sigmoid, u32, arg0, out, count);
-        OPENVINO_TYPE_CASE(evaluate_sigmoid, u64, arg0, out, count);
-        OPENVINO_TYPE_CASE(evaluate_sigmoid, f16, arg0, out, count);
-        OPENVINO_TYPE_CASE(evaluate_sigmoid, f32, arg0, out, count);
-    default:
-        rc = false;
-        break;
-    }
-    return rc;
-}
-}  // namespace
-}  // namespace sigmoid
+    const auto& in_shape = inputs[0].get_shape();
+    outputs[0].set_shape(in_shape);
 
-bool ov::op::v0::Sigmoid::evaluate(const HostTensorVector& outputs, const HostTensorVector& inputs) const {
-    OV_OP_SCOPE(v0_Sigmoid_evaluate);
-    OPENVINO_SUPPRESS_DEPRECATED_START
-    OPENVINO_ASSERT(validate_host_tensor_vector(outputs, 1) && validate_host_tensor_vector(inputs, 1));
-    OPENVINO_SUPPRESS_DEPRECATED_END
-    return sigmoid::evaluate_sigmoid(inputs[0], outputs[0]);
+    using namespace ov::element;
+    return IfTypeOf<f16, f32, i32, i64, u32, u64>::apply<sigmoid::Evaluate>(inputs[0].get_element_type(),
+                                                                            inputs[0],
+                                                                            outputs[0],
+                                                                            shape_size(in_shape));
 }
 
-bool ov::op::v0::Sigmoid::has_evaluate() const {
+bool Sigmoid::has_evaluate() const {
     OV_OP_SCOPE(v0_Sigmoid_has_evaluate);
     switch (get_input_element_type(0)) {
-    case ngraph::element::i32:
-    case ngraph::element::i64:
-    case ngraph::element::u32:
-    case ngraph::element::u64:
-    case ngraph::element::f16:
-    case ngraph::element::f32:
+    case element::f16:
+    case element::f32:
+    case element::i32:
+    case element::i64:
+    case element::u32:
+    case element::u64:
         return true;
     default:
-        break;
+        return false;
     }
-    return false;
 }
+}  // namespace v0
+}  // namespace op
+}  // namespace ov

From 57571d36e6c9717d5f73dfb54cbe5b8ff4fa8361 Mon Sep 17 00:00:00 2001
From: Nikolay Shchegolev <nikolay.shchegolev@intel.com>
Date: Tue, 31 Oct 2023 16:10:52 +0400
Subject: [PATCH 6/9] [CPU] NMSRotated operation implementation. (#20410)

---
 .../sort/NMSRotated_13.md                     |    4 +-
 .../src/transformations/convert_precision.cpp |   47 +
 src/plugins/intel_cpu/src/cpu_types.cpp       |    1 +
 src/plugins/intel_cpu/src/node.cpp            |   37 +-
 src/plugins/intel_cpu/src/node.h              |    1 +
 .../nodes/kernels/x64/non_max_suppression.cpp |  465 ++++++
 .../nodes/kernels/x64/non_max_suppression.hpp |  152 ++
 .../src/nodes/non_max_suppression.cpp         | 1467 +++++++----------
 .../intel_cpu/src/nodes/non_max_suppression.h |  172 +-
 .../skip_tests_config.cpp                     |    2 +
 .../instances/common/nms_rotated.cpp          |   95 ++
 .../non_max_suppression.cpp                   |   30 +-
 .../include/single_op_tests/nms_rotated.hpp   |   15 +
 .../single_op/nms_rotated.hpp                 |   47 +
 .../src/single_op/nms_rotated.cpp             |  207 +++
 .../skip_configs/CPU/expected_failures_OP.csv |    1 -
 16 files changed, 1792 insertions(+), 951 deletions(-)
 create mode 100644 src/plugins/intel_cpu/src/nodes/kernels/x64/non_max_suppression.cpp
 create mode 100644 src/plugins/intel_cpu/src/nodes/kernels/x64/non_max_suppression.hpp
 create mode 100644 src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/common/nms_rotated.cpp
 create mode 100644 src/tests/functional/plugin/shared/include/single_op_tests/nms_rotated.hpp
 create mode 100644 src/tests/functional/shared_test_classes/include/shared_test_classes/single_op/nms_rotated.hpp
 create mode 100644 src/tests/functional/shared_test_classes/src/single_op/nms_rotated.cpp

diff --git a/docs/articles_en/documentation/openvino_ir/operation_sets/operations_specifications/sort/NMSRotated_13.md b/docs/articles_en/documentation/openvino_ir/operation_sets/operations_specifications/sort/NMSRotated_13.md
index 5ae29954802563..964f9bdb522380 100644
--- a/docs/articles_en/documentation/openvino_ir/operation_sets/operations_specifications/sort/NMSRotated_13.md
+++ b/docs/articles_en/documentation/openvino_ir/operation_sets/operations_specifications/sort/NMSRotated_13.md
@@ -27,13 +27,13 @@ The general algorithm is described below:
 
 Here ``func(rotated_iou(b_i, b)) = 1 if rotated_iou(b_i, b) <= iou_threshold else 0``.
 
-Having two bouding boxes ``B1`` and ``B2`` the following steps are performed to calculate ``rotated_iou(B1, B2)``:
+Having two bounding boxes ``B1`` and ``B2`` the following steps are performed to calculate ``rotated_iou(B1, B2)``:
 
 1. Calculate rotated vertices, (x, y) coordinates of the 4 corners of each box transformed by the corresponding angle in radians according to the direction specified by the *clockwise* attribute.
 2. Find all intersection points between edges of ``B1`` and ``B2``. Add them to the ``intersection_points``.
 3. Find all corners of ``B1`` within area of ``B2``, and all corners of ``B2`` within area of ``B1``. Add them to the ``intersection_points``.
 4. Calculate ``intersection_area`` of the polygon described by ``intersection_points`` (see Sholeace formula).
-5. Calculate ``union_area`` (the common area of ``B1`` and ``B2``), `union_area = (B1_area + B2_area) - intersection_area`.
+5. Calculate ``union_area`` (the sum of the areas of ``B1`` and ``B2``; the overlap is subtracted in the next step), ``union_area = B1_area + B2_area``.
 6. Return intersection over union ``rotated_iou = intersection_area / (union_area - intersection_area)``.
 
 
diff --git a/src/common/transformations/src/transformations/convert_precision.cpp b/src/common/transformations/src/transformations/convert_precision.cpp
index a1e9dd7a820e16..4fd52934dd415f 100644
--- a/src/common/transformations/src/transformations/convert_precision.cpp
+++ b/src/common/transformations/src/transformations/convert_precision.cpp
@@ -49,6 +49,7 @@ bool fuse_type_to_nms3(const std::shared_ptr<ov::Node>& node, const precisions_m
 bool fuse_type_to_nms4(const std::shared_ptr<ov::Node>& node, const precisions_map& precisions);
 bool fuse_type_to_nms5(const std::shared_ptr<ov::Node>& node, const precisions_map& precisions);
 bool fuse_type_to_nms9(const std::shared_ptr<ov::Node>& node, const precisions_map& precisions);
+bool fuse_type_to_nms_rotated(const std::shared_ptr<ov::Node>& node, const precisions_map& precisions);
 bool fuse_type_to_matrix_nms(const std::shared_ptr<ov::Node>& node, const precisions_map& precisions);
 bool fuse_type_to_multiclass_nms(const std::shared_ptr<ov::Node>& node, const precisions_map& precisions);
 bool fuse_type_to_generate_proposals(const std::shared_ptr<ov::Node>& node, const precisions_map& precisions);
@@ -383,6 +384,7 @@ bool ov::pass::ConvertPrecision::run_on_model(const std::shared_ptr<ov::Model>&
         {opset4::NonMaxSuppression::get_type_info_static(), fuse_type_to_nms4},
         {opset5::NonMaxSuppression::get_type_info_static(), fuse_type_to_nms5},
         {opset9::NonMaxSuppression::get_type_info_static(), fuse_type_to_nms9},
+        {op::v13::NMSRotated::get_type_info_static(), fuse_type_to_nms_rotated},
         {opset8::MatrixNms::get_type_info_static(), fuse_type_to_matrix_nms},
         {opset8::MulticlassNms::get_type_info_static(), fuse_type_to_multiclass_nms},
         {opset9::MulticlassNms::get_type_info_static(), fuse_type_to_multiclass_nms},
@@ -691,6 +693,51 @@ bool fuse_type_to_nms9(const std::shared_ptr<ov::Node>& node, const precisions_m
     return res;
 }
 
+bool fuse_type_to_nms_rotated(const std::shared_ptr<ov::Node>& node, const precisions_map& precisions) {
+    auto nms = ov::as_type_ptr<op::v13::NMSRotated>(node);
+    if (!nms) {
+        return false;
+    }
+
+    bool res = false;
+    auto it = precisions.find(node->get_output_element_type(0));
+    if (it != precisions.end()) {
+        const auto& to = it->second;
+        if (to == ov::element::i32 || to == ov::element::i64) {
+            nms->set_output_type_attr(to);
+            res = true;
+            if (precisions.count(node->get_output_element_type(1)) == 0) {
+                return res;
+            }
+        }
+    }
+
+    auto type_relaxed = std::dynamic_pointer_cast<ov::op::TypeRelaxedBase>(node);
+    ov::element::TypeVector output_types;
+    for (size_t i = 0; i < node->get_output_size(); i++) {
+        it = precisions.find(node->get_output_element_type(i));
+        if (it == precisions.end()) {
+            output_types.push_back(node->get_output_element_type(i));
+            continue;
+        }
+        const auto& to = it->second;
+        if (type_relaxed) {
+            type_relaxed->set_overridden_output_type(to, i);
+            res = true;
+        }
+        output_types.push_back(to);
+    }
+
+    if (!type_relaxed) {
+        auto relaxed_op =
+            std::make_shared<ov::op::TypeRelaxed<op::v13::NMSRotated>>(*nms, ov::element::TypeVector{}, output_types);
+        replace_node(node, relaxed_op);
+        res = true;
+    }
+
+    return res;
+}
+
 namespace {
 
 bool update_type(size_t idx,
diff --git a/src/plugins/intel_cpu/src/cpu_types.cpp b/src/plugins/intel_cpu/src/cpu_types.cpp
index 139685f5882103..56cdbe32a2da4e 100644
--- a/src/plugins/intel_cpu/src/cpu_types.cpp
+++ b/src/plugins/intel_cpu/src/cpu_types.cpp
@@ -201,6 +201,7 @@ static const TypeToNameMap& get_type_to_name_tbl() {
             { "ExtractImagePatches", Type::ExtractImagePatches},
             { "NonMaxSuppression", Type::NonMaxSuppression},
             { "NonMaxSuppressionIEInternal", Type::NonMaxSuppression},
+            { "NMSRotated", Type::NonMaxSuppression},
             { "MatrixNms", Type::MatrixNms},
             { "MulticlassNms", Type::MulticlassNms},
             { "MulticlassNmsIEInternal", Type::MulticlassNms},
diff --git a/src/plugins/intel_cpu/src/node.cpp b/src/plugins/intel_cpu/src/node.cpp
index ab02ae44dd6ce2..c36815ee048091 100644
--- a/src/plugins/intel_cpu/src/node.cpp
+++ b/src/plugins/intel_cpu/src/node.cpp
@@ -615,26 +615,31 @@ bool Node::outputShapeDataDependency() const {
 
 void Node::redefineOutputMemory(const std::vector<VectorDims> &newOutputShapes) {
     if (newOutputShapes.size() != outputShapes.size()) {
-        IE_THROW() << "Number shapes mismatch with real outputs number for node with name: " << getName();
+        THROW_CPU_NODE_ERR("has shapes number mismatch with real outputs number.");
     }
-    for (size_t i = 0; i < outputShapes.size(); i++) {
-        const auto edges = getChildEdgesAtPort(i);
+    for (size_t i = 0lu; i < outputShapes.size(); i++) {
+        redefineOutputMemory(i, newOutputShapes[i]);
+    }
+}
 
-        // avoid 0D shape incompatible
-        auto newOutputShape = newOutputShapes[i];
-        if (newOutputShape.empty()) {
-            newOutputShape.push_back(1);
-        }
+void Node::redefineOutputMemory(const size_t port, const VectorDims& new_output_shape) {
+    const auto edges = getChildEdgesAtPort(port);
 
-        const auto &currDesc = edges[0]->getMemory().getDesc();
-        if (currDesc.getShape().isStatic() && currDesc.getShape().getStaticDims() == newOutputShape)
-            continue;
+    // avoid 0D shape incompatible
+    auto new_shape = new_output_shape;
+    if (new_shape.empty()) {
+        new_shape.push_back(1);
+    }
 
-        const bool hasZeroDims = std::count(std::begin(newOutputShape), std::end(newOutputShape), 0) > 0;
-        const auto memDesc = getBaseMemDescAtOutputPort(i)->cloneWithNewDims(newOutputShape, hasZeroDims);
-        for (size_t j = 0; j < edges.size(); j++) {
-            edges[j]->getMemoryPtr()->redefineDesc(memDesc);
-        }
+    const auto& curr_desc = edges[0]->getMemory().getDesc();
+    if (curr_desc.getShape().isStatic() && curr_desc.getShape().getStaticDims() == new_shape) {
+        return;
+    }
+
+    const bool has_zero_dims = std::count(std::begin(new_shape), std::end(new_shape), 0lu) > 0;
+    const auto mem_desc = getBaseMemDescAtOutputPort(port)->cloneWithNewDims(new_shape, has_zero_dims);
+    for (size_t j = 0lu; j < edges.size(); j++) {
+        edges[j]->getMemoryPtr()->redefineDesc(mem_desc);
     }
 }
 
diff --git a/src/plugins/intel_cpu/src/node.h b/src/plugins/intel_cpu/src/node.h
index 864c08a95b04c6..4b6fa3a87f72dd 100644
--- a/src/plugins/intel_cpu/src/node.h
+++ b/src/plugins/intel_cpu/src/node.h
@@ -366,6 +366,7 @@ class Node {
     void updateDynamicParams();
     void executeDynamic(dnnl::stream strm);
     virtual void redefineOutputMemory(const std::vector<VectorDims> &newShapes);
+    void redefineOutputMemory(const size_t port, const VectorDims& new_output_shape);  // single-port overload; an empty shape is treated as {1}
     bool outputShapeDataDependency() const;
 
     virtual void initSupportedPrimitiveDescriptors();
diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/non_max_suppression.cpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/non_max_suppression.cpp
new file mode 100644
index 00000000000000..f9c665ec9c5eea
--- /dev/null
+++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/non_max_suppression.cpp
@@ -0,0 +1,465 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "non_max_suppression.hpp"
+#include "utils/general_utils.h"
+
+using namespace InferenceEngine;
+using namespace dnnl::impl::cpu;
+
+#define GET_OFF(field) offsetof(NmsCallArgs, field)
+
+namespace ov {
+namespace intel_cpu {
+namespace kernel {
+
+template <x64::cpu_isa_t isa>
+void NonMaxSuppression<isa>::generate() {  // emits the whole kernel: argument loads, candidate setup, soft/hard dispatch, data tables
+    load_vector_emitter.reset(new jit_load_emitter(this, isa, Precision::FP32, Precision::FP32, vector_step));
+    load_scalar_emitter.reset(new jit_load_emitter(this, isa, Precision::FP32, Precision::FP32, scalar_step));
+
+    exp_injector.reset(new x64::jit_uni_eltwise_injector_f32<isa>(this, dnnl::impl::alg_kind::eltwise_exp, 0.f, 0.f, 1.f));
+
+    this->preamble();
+
+    uni_vpxor(vmm_zero, vmm_zero, vmm_zero);  // vmm_zero = 0: clamp floor in iou() and the value stored to candidate_status
+
+    load_pool_gpr_idxs = {static_cast<size_t>(reg_load_store_mask.getIdx()), static_cast<size_t>(reg_load_table.getIdx())};
+    store_pool_gpr_idxs = {static_cast<size_t>(reg_load_store_mask.getIdx())};
+    store_pool_vec_idxs = {static_cast<size_t>(vmm_zero.getIdx())};
+
+    mov(reg_boxes_coord0, ptr[reg_params + GET_OFF(selected_boxes_coord[0])]);  // the four array pointers are read at sizeof(size_t) strides
+    mov(reg_boxes_coord1, ptr[reg_params + GET_OFF(selected_boxes_coord[0]) + 1 * sizeof(size_t)]);
+    mov(reg_boxes_coord2, ptr[reg_params + GET_OFF(selected_boxes_coord[0]) + 2 * sizeof(size_t)]);
+    mov(reg_boxes_coord3, ptr[reg_params + GET_OFF(selected_boxes_coord[0]) + 3 * sizeof(size_t)]);
+    mov(reg_candidate_box, ptr[reg_params + GET_OFF(candidate_box)]);
+    mov(reg_candidate_status, ptr[reg_params + GET_OFF(candidate_status)]);
+    mov(reg_boxes_num, ptr[reg_params + GET_OFF(selected_boxes_num)]);
+    mov(reg_iou_threshold, ptr[reg_params + GET_OFF(iou_threshold)]);
+    // soft-NMS arguments (scale additionally selects the soft or hard path below)
+    mov(reg_score_threshold, ptr[reg_params + GET_OFF(score_threshold)]);
+    mov(reg_score, ptr[reg_params + GET_OFF(score)]);
+    mov(reg_scale, ptr[reg_params + GET_OFF(scale)]);
+
+    // all call arguments have been read from reg_params, so rcx (reg_table) and rdi (reg_temp) can be reused from here on
+    mov(reg_table, l_table_constant);
+    if (x64::mayiuse(x64::avx512_core)) {
+        kmovw(k_mask_one, word[reg_table + vlen]);  // the 0x0001 word that prepare_table() places right after the 0.5f row
+    }
+    uni_vbroadcastss(vmm_iou_threshold, ptr[reg_iou_threshold]);
+    uni_vbroadcastss(vmm_score_threshold, ptr[reg_score_threshold]);
+
+    uni_vbroadcastss(vmm_candidate_coord0, ptr[reg_candidate_box]);
+    uni_vbroadcastss(vmm_candidate_coord1, ptr[reg_candidate_box + 1 * sizeof(float)]);
+    uni_vbroadcastss(vmm_candidate_coord2, ptr[reg_candidate_box + 2 * sizeof(float)]);
+    uni_vbroadcastss(vmm_candidate_coord3, ptr[reg_candidate_box + 3 * sizeof(float)]);
+
+    if (m_jcp.box_encode_type == NMSBoxEncodeType::CORNER) {
+        // box format: y1, x1, y2, x2; min/max makes coord0 <= coord2 and coord1 <= coord3
+        uni_vminps(vmm_temp1, vmm_candidate_coord0, vmm_candidate_coord2);
+        uni_vmaxps(vmm_temp2, vmm_candidate_coord0, vmm_candidate_coord2);
+        uni_vmovups(vmm_candidate_coord0, vmm_temp1);
+        uni_vmovups(vmm_candidate_coord2, vmm_temp2);
+
+        uni_vminps(vmm_temp1, vmm_candidate_coord1, vmm_candidate_coord3);
+        uni_vmaxps(vmm_temp2, vmm_candidate_coord1, vmm_candidate_coord3);
+        uni_vmovups(vmm_candidate_coord1, vmm_temp1);
+        uni_vmovups(vmm_candidate_coord3, vmm_temp2);
+    } else {
+        // box format: x_center, y_center, width, height --> y1, x1, y2, x2 (ptr[reg_table] holds broadcast 0.5f)
+        uni_vmulps(vmm_temp1, vmm_candidate_coord2, ptr[reg_table]);   // width/2
+        uni_vmulps(vmm_temp2, vmm_candidate_coord3, ptr[reg_table]);   // height/2
+
+        uni_vaddps(vmm_temp3, vmm_candidate_coord0, vmm_temp1);  // x_center + width/2
+        uni_vmovups(vmm_candidate_coord3, vmm_temp3);
+
+        uni_vaddps(vmm_temp3, vmm_candidate_coord1, vmm_temp2);  // y_center + height/2
+        uni_vmovups(vmm_candidate_coord2, vmm_temp3);
+
+        uni_vsubps(vmm_temp3, vmm_candidate_coord0, vmm_temp1);  // x_center - width/2
+        uni_vsubps(vmm_temp4, vmm_candidate_coord1, vmm_temp2);  // y_center - height/2
+
+        uni_vmovups(vmm_candidate_coord1, vmm_temp3);
+        uni_vmovups(vmm_candidate_coord0, vmm_temp4);
+    }
+
+    // move each coordinate pointer past the last selected box; the loops then walk backwards, from last to first
+    imul(reg_temp_64, reg_boxes_num, sizeof(float));
+    add(reg_boxes_coord0, reg_temp_64);  // y1
+    add(reg_boxes_coord1, reg_temp_64);  // x1
+    add(reg_boxes_coord2, reg_temp_64);  // y2
+    add(reg_boxes_coord3, reg_temp_64);  // x2
+
+    Xbyak::Label hard_nms_label;
+    Xbyak::Label nms_end_label;
+
+    mov(reg_temp_32, ptr[reg_scale]);  // raw 32 bits of scale; all-zero bits (+0.0f) select the hard-NMS path
+    test(reg_temp_32, reg_temp_32);
+    jz(hard_nms_label, T_NEAR);
+
+    soft_nms();
+
+    jmp(nms_end_label, T_NEAR);
+
+    L(hard_nms_label);
+
+    hard_nms();
+
+    L(nms_end_label);
+
+    this->postamble();
+
+    load_vector_emitter->emit_data();
+    load_scalar_emitter->emit_data();
+
+    prepare_table();
+    exp_injector->prepare_table();
+}
+
+
+template <x64::cpu_isa_t isa>
+void NonMaxSuppression<isa>::hard_nms() {  // hard NMS: the first box with IoU >= iou_threshold suppresses the candidate
+    Xbyak::Label main_loop_label_hard;
+    Xbyak::Label main_loop_end_label_hard;
+    Xbyak::Label tail_loop_label_hard;
+    Xbyak::Label terminate_label_hard;
+    L(main_loop_label_hard);
+    {
+        cmp(reg_boxes_num, vector_step);
+        jl(main_loop_end_label_hard, T_NEAR);  // fewer than vector_step boxes left: go to the scalar tail
+
+        sub(reg_boxes_coord0, vector_step * sizeof(float));
+        sub(reg_boxes_coord1, vector_step * sizeof(float));
+        sub(reg_boxes_coord2, vector_step * sizeof(float));
+        sub(reg_boxes_coord3, vector_step * sizeof(float));
+
+        // iou result is in vmm_temp3
+        iou(vector_step);
+
+        sub(reg_boxes_num, vector_step);
+
+        suppressed_by_iou(false);
+
+        // ZF set (no lane reached the threshold): continue; otherwise mark the candidate suppressed and terminate
+        jz(main_loop_label_hard, T_NEAR);
+
+        uni_vpextrd(ptr[reg_candidate_status], Xbyak::Xmm(vmm_zero.getIdx()), 0);  // *candidate_status = 0
+
+        jmp(terminate_label_hard, T_NEAR);
+    }
+    L(main_loop_end_label_hard);
+
+    L(tail_loop_label_hard);
+    {
+        cmp(reg_boxes_num, 1);
+        jl(terminate_label_hard, T_NEAR);  // no boxes left
+
+        sub(reg_boxes_coord0, scalar_step * sizeof(float));
+        sub(reg_boxes_coord1, scalar_step * sizeof(float));
+        sub(reg_boxes_coord2, scalar_step * sizeof(float));
+        sub(reg_boxes_coord3, scalar_step * sizeof(float));
+
+        // iou result is in lane 0 of vmm_temp3
+        iou(scalar_step);
+
+        sub(reg_boxes_num, scalar_step);
+
+        suppressed_by_iou(true);
+
+        jz(tail_loop_label_hard, T_NEAR);
+
+        uni_vpextrd(ptr[reg_candidate_status], Xbyak::Xmm(vmm_zero.getIdx()), 0);  // *candidate_status = 0
+
+        jmp(terminate_label_hard, T_NEAR);
+    }
+
+    L(terminate_label_hard);
+}
+
+template <x64::cpu_isa_t isa>
+void NonMaxSuppression<isa>::soft_nms() {  // soft NMS: decays *score by exp(scale * iou^2) per box; suppresses once it is <= score_threshold
+    uni_vbroadcastss(vmm_scale, ptr[reg_scale]);
+
+    Xbyak::Label main_loop_label;
+    Xbyak::Label main_loop_end_label;
+    Xbyak::Label tail_loop_label;
+    Xbyak::Label terminate_label;
+
+    Xbyak::Label main_loop_label_soft;
+    Xbyak::Label tail_loop_label_soft;
+    L(main_loop_label);
+    {
+        cmp(reg_boxes_num, vector_step);
+        jl(main_loop_end_label, T_NEAR);  // fewer than vector_step boxes left: go to the scalar tail
+
+        sub(reg_boxes_coord0, vector_step * sizeof(float));
+        sub(reg_boxes_coord1, vector_step * sizeof(float));
+        sub(reg_boxes_coord2, vector_step * sizeof(float));
+        sub(reg_boxes_coord3, vector_step * sizeof(float));
+
+        // result (first the iou, later the weight) is in vmm_temp3
+        iou(vector_step);
+        sub(reg_boxes_num, vector_step);
+
+        // optional hard check against iou_threshold, emitted only when requested at compile time
+        if (m_jcp.is_soft_suppressed_by_iou) {
+            suppressed_by_iou(false);
+
+            // if zero continue soft suppression, else set result to suppressed and terminate
+            jz(main_loop_label_soft, T_NEAR);
+
+            uni_vpextrd(ptr[reg_candidate_status], Xbyak::Xmm(vmm_zero.getIdx()), 0);  // *candidate_status = 0
+
+            jmp(terminate_label, T_NEAR);
+
+            L(main_loop_label_soft);
+        }
+
+        // weight: std::exp(scale * iou * iou)
+        soft_coeff();
+
+        // multiply the weights of all lanes together; the product ends up in lane 0 of vmm_temp3
+        horizontal_mul();
+
+        uni_vbroadcastss(vmm_temp1, ptr[reg_score]);
+
+        // new score in vmm_temp3[0]
+        uni_vmulps(vmm_temp3, vmm_temp3, vmm_temp1);
+        // store new score
+        uni_vmovss(ptr[reg_score], vmm_temp3);
+
+        // cmpps(_CMP_LE_OS): checks whether the new score is less than or equal to score_threshold
+        suppressed_by_score();
+
+        jz(main_loop_label, T_NEAR);  // ZF set: score still above the threshold, take the next vector of boxes
+
+        uni_vpextrd(ptr[reg_candidate_status], Xbyak::Xmm(vmm_zero.getIdx()), 0);  // *candidate_status = 0
+
+        jmp(terminate_label, T_NEAR);
+    }
+    L(main_loop_end_label);
+
+    L(tail_loop_label);
+    {
+        cmp(reg_boxes_num, 1);
+        jl(terminate_label, T_NEAR);  // no boxes left
+
+        sub(reg_boxes_coord0, scalar_step * sizeof(float));
+        sub(reg_boxes_coord1, scalar_step * sizeof(float));
+        sub(reg_boxes_coord2, scalar_step * sizeof(float));
+        sub(reg_boxes_coord3, scalar_step * sizeof(float));
+
+        iou(scalar_step);
+        sub(reg_boxes_num, scalar_step);
+
+        // optional hard check against iou_threshold, emitted only when requested at compile time
+        if (m_jcp.is_soft_suppressed_by_iou) {
+            suppressed_by_iou(true);
+
+            jz(tail_loop_label_soft, T_NEAR);
+
+            uni_vpextrd(ptr[reg_candidate_status], Xbyak::Xmm(vmm_zero.getIdx()), 0);  // *candidate_status = 0
+
+            jmp(terminate_label, T_NEAR);
+
+            L(tail_loop_label_soft);
+        }
+
+        soft_coeff();
+
+        uni_vbroadcastss(vmm_temp1, ptr[reg_score]);
+
+        // vmm_temp3[0] already holds the single valid weight, so no horizontal mul is needed
+        uni_vmulps(vmm_temp3, vmm_temp3, vmm_temp1);
+
+        uni_vmovss(ptr[reg_score], vmm_temp3);
+
+        // cmpps(_CMP_LE_OS): checks whether the new score is less than or equal to score_threshold
+        suppressed_by_score();
+
+        jz(tail_loop_label, T_NEAR);
+
+        uni_vpextrd(ptr[reg_candidate_status], Xbyak::Xmm(vmm_zero.getIdx()), 0);  // *candidate_status = 0
+
+        jmp(terminate_label, T_NEAR);
+    }
+
+    L(terminate_label);
+}
+
+template <x64::cpu_isa_t isa>
+void NonMaxSuppression<isa>::suppressed_by_iou(bool is_scalar) {  // leaves ZF=1 when no checked lane of vmm_temp3 is >= iou_threshold
+    if (x64::mayiuse(x64::avx512_core)) {
+        vcmpps(k_mask, vmm_temp3, vmm_iou_threshold, 0x0D); // _CMP_GE_OS; the opmask-destination form of vcmpps exists only on AVX-512
+        if (is_scalar)
+            kandw(k_mask, k_mask, k_mask_one);  // keep only lane 0
+        kortestw(k_mask, k_mask);    // bitwise check if all zero
+    } else if (x64::mayiuse(x64::avx)) {
+        // vex instructions with xmm on avx and ymm on avx2
+        vcmpps(vmm_temp4, vmm_temp3, vmm_iou_threshold, 0x0D);  // _CMP_GE_OS; VEX form writes a per-lane mask into vmm_temp4
+        if (is_scalar) {
+            uni_vpextrd(reg_temp_32, Xbyak::Xmm(vmm_temp4.getIdx()), 0);  // only lane 0 is meaningful
+            test(reg_temp_32, reg_temp_32);
+        } else {
+            uni_vtestps(vmm_temp4, vmm_temp4);  // vtestps: ZF=1 when all lane sign bits are zero; per the original note it is not available on the AVX-512 path
+        }
+    } else {
+        // pure SSE path: work on copies so that vmm_temp3 (the iou) stays intact for the soft-suppression math that may follow
+        uni_vmovups(vmm_temp4, vmm_temp3);
+        cmpps(vmm_temp4, vmm_iou_threshold, 0x07);  // ordered compare: a lane is 0 when at least one operand is NaN
+
+        uni_vmovups(vmm_temp2, vmm_temp3);
+        cmpps(vmm_temp2, vmm_iou_threshold, 0x05);   // _CMP_GE_US on sse, no direct _CMP_GE_OS supported; combined with the ordered mask below
+
+        uni_vandps(vmm_temp4, vmm_temp4, vmm_temp2);
+        if (is_scalar) {
+            uni_vpextrd(reg_temp_32, Xbyak::Xmm(vmm_temp4.getIdx()), 0);  // only lane 0 is meaningful
+            test(reg_temp_32, reg_temp_32);
+        } else {
+            uni_vtestps(vmm_temp4, vmm_temp4);  // ptest: bitwise check if all zeros, on sse41
+        }
+    }
+}
+
+template <x64::cpu_isa_t isa>
+void NonMaxSuppression<isa>::suppressed_by_score() {  // leaves ZF=1 when lane 0 of vmm_temp3 (the new score) is NOT <= score_threshold
+    if (x64::mayiuse(x64::avx512_core)) {
+        vcmpps(k_mask, vmm_temp3, vmm_score_threshold, 0x02); // _CMP_LE_OS; per the original note only the opmask form of vcmpps is available here
+        kandw(k_mask, k_mask, k_mask_one);  // keep only lane 0
+        kortestw(k_mask, k_mask);    // bitwise check if all zero
+    } else if (x64::mayiuse(x64::avx)) {
+        vcmpps(vmm_temp4, vmm_temp3, vmm_score_threshold, 0x02);  // _CMP_LE_OS, mask goes to vmm_temp4
+        uni_vpextrd(reg_temp_32, Xbyak::Xmm(vmm_temp4.getIdx()), 0);
+        test(reg_temp_32, reg_temp_32);
+    } else {
+        cmpps(vmm_temp3, vmm_score_threshold, 0x02);  // _CMP_LE_OS on sse; overwrites vmm_temp3 with the mask
+        uni_vpextrd(reg_temp_32, Xbyak::Xmm(vmm_temp3.getIdx()), 0);
+        test(reg_temp_32, reg_temp_32);
+    }
+}
+
+template <x64::cpu_isa_t isa>
+void NonMaxSuppression<isa>::iou(int ele_num) {  // loads ele_num selected boxes and leaves their IoU with the candidate in vmm_temp3
+    auto load = [&](Xbyak::Reg64 reg_src, Vmm vmm_dst) {
+        if (ele_num != scalar_step && ele_num != vector_step)
+            OPENVINO_THROW("NMS JIT implementation supports load emitter with only element count scalar_step or vector_step! Get: ", ele_num);
+
+        const auto& load_emitter = ele_num == 1 ? load_scalar_emitter : load_vector_emitter;  // 1 == scalar_step
+        load_emitter->emit_code({static_cast<size_t>(reg_src.getIdx())}, {static_cast<size_t>(vmm_dst.getIdx())},
+            {}, {load_pool_gpr_idxs});
+    };
+    load(reg_boxes_coord0, vmm_boxes_coord0);
+    load(reg_boxes_coord1, vmm_boxes_coord1);
+    load(reg_boxes_coord2, vmm_boxes_coord2);
+    load(reg_boxes_coord3, vmm_boxes_coord3);
+
+    if (m_jcp.box_encode_type == NMSBoxEncodeType::CORNER) {
+        // box format: y1, x1, y2, x2; min/max makes coord0 <= coord2 and coord1 <= coord3
+        uni_vminps(vmm_temp1, vmm_boxes_coord0, vmm_boxes_coord2);
+        uni_vmaxps(vmm_temp2, vmm_boxes_coord0, vmm_boxes_coord2);
+        uni_vmovups(vmm_boxes_coord0, vmm_temp1);
+        uni_vmovups(vmm_boxes_coord2, vmm_temp2);
+
+        uni_vminps(vmm_temp1, vmm_boxes_coord1, vmm_boxes_coord3);
+        uni_vmaxps(vmm_temp2, vmm_boxes_coord1, vmm_boxes_coord3);
+        uni_vmovups(vmm_boxes_coord1, vmm_temp1);
+        uni_vmovups(vmm_boxes_coord3, vmm_temp2);
+    } else {
+        // box format: x_center, y_center, width, height --> y1, x1, y2, x2 (ptr[reg_table] holds broadcast 0.5f)
+        uni_vmulps(vmm_temp1, vmm_boxes_coord2, ptr[reg_table]);   // width/2
+        uni_vmulps(vmm_temp2, vmm_boxes_coord3, ptr[reg_table]);   // height/2
+
+        uni_vaddps(vmm_temp3, vmm_boxes_coord0, vmm_temp1);  // x_center + width/2
+        uni_vmovups(vmm_boxes_coord3, vmm_temp3);
+
+        uni_vaddps(vmm_temp3, vmm_boxes_coord1, vmm_temp2);  // y_center + height/2
+        uni_vmovups(vmm_boxes_coord2, vmm_temp3);
+
+        uni_vsubps(vmm_temp3, vmm_boxes_coord0, vmm_temp1);  // x_center - width/2
+        uni_vsubps(vmm_temp4, vmm_boxes_coord1, vmm_temp2);  // y_center - height/2
+
+        uni_vmovups(vmm_boxes_coord1, vmm_temp3);
+        uni_vmovups(vmm_boxes_coord0, vmm_temp4);
+    }
+
+    uni_vsubps(vmm_temp1, vmm_boxes_coord2, vmm_boxes_coord0);
+    uni_vsubps(vmm_temp2, vmm_boxes_coord3, vmm_boxes_coord1);
+    uni_vmulps(vmm_temp1, vmm_temp1, vmm_temp2);  // boxes area
+
+    uni_vsubps(vmm_temp2, vmm_candidate_coord2, vmm_candidate_coord0);
+    uni_vsubps(vmm_temp3, vmm_candidate_coord3, vmm_candidate_coord1);
+    uni_vmulps(vmm_temp2, vmm_temp2, vmm_temp3);  // candidate area (original note: could be calculated once and checked for 0)
+
+    uni_vaddps(vmm_temp1, vmm_temp1, vmm_temp2);  // areaI + areaJ to free vmm_temp2
+
+    // y of intersection
+    uni_vminps(vmm_temp3, vmm_boxes_coord2, vmm_candidate_coord2);  // min(Ymax)
+    uni_vmaxps(vmm_temp4, vmm_boxes_coord0, vmm_candidate_coord0);  // max(Ymin)
+    uni_vsubps(vmm_temp3, vmm_temp3, vmm_temp4);  // min(Ymax) - max(Ymin)
+    uni_vmaxps(vmm_temp3, vmm_temp3, vmm_zero);  // clamp at 0: no overlap along y
+
+    // x of intersection
+    uni_vminps(vmm_temp4, vmm_boxes_coord3, vmm_candidate_coord3);  // min(Xmax)
+    uni_vmaxps(vmm_temp2, vmm_boxes_coord1, vmm_candidate_coord1);  // max(Xmin)
+    uni_vsubps(vmm_temp4, vmm_temp4, vmm_temp2);  // min(Xmax) - max(Xmin)
+    uni_vmaxps(vmm_temp4, vmm_temp4, vmm_zero);  // clamp at 0: no overlap along x
+
+    // intersection_area
+    uni_vmulps(vmm_temp3, vmm_temp3, vmm_temp4);
+
+    // iou: intersection_area / (areaI + areaJ - intersection_area);
+    uni_vsubps(vmm_temp1, vmm_temp1, vmm_temp3);
+    uni_vdivps(vmm_temp3, vmm_temp3, vmm_temp1);
+}
+
+// In place on vmm_temp3: iou -> std::exp(scale * iou * iou), with scale taken from vmm_scale
+template <x64::cpu_isa_t isa>
+void NonMaxSuppression<isa>::soft_coeff() {
+    uni_vmulps(vmm_temp3, vmm_temp3, vmm_temp3);  // iou * iou
+    uni_vmulps(vmm_temp3, vmm_temp3, vmm_scale);  // scale * iou * iou
+    exp_injector->compute_vector_range(vmm_temp3.getIdx(), vmm_temp3.getIdx() + 1);  // exp over the single register vmm_temp3
+}
+
+template <x64::cpu_isa_t isa>
+void NonMaxSuppression<isa>::horizontal_mul_xmm(const Xbyak::Xmm &xmm_weight, const Xbyak::Xmm &xmm_aux) {  // product of the 4 lanes -> lane 0 of xmm_weight
+    uni_vmovshdup(xmm_aux, xmm_weight);              //  weight:1,2,3,4; aux:2,2,4,4
+    uni_vmulps(xmm_weight, xmm_weight, xmm_aux);     //  weight:1*2,2*2,3*4,4*4
+    uni_vmovhlps(xmm_aux, xmm_aux, xmm_weight);      //  aux:3*4,4*4,4,4
+    uni_vmulps(xmm_weight, xmm_weight, xmm_aux);     //  weight:1*2*3*4,... (only lane 0 holds the full product)
+}
+
+// horizontal mul of the weights in vmm_temp3: the product of all lanes ends up in lane 0; vmm_temp1 and vmm_temp2 are clobbered as aux
+template <x64::cpu_isa_t isa>
+inline void NonMaxSuppression<isa>::horizontal_mul() {
+    Xbyak::Xmm xmm_weight = Xbyak::Xmm(vmm_temp3.getIdx());
+    Xbyak::Xmm xmm_temp1 = Xbyak::Xmm(vmm_temp1.getIdx());
+    Xbyak::Xmm xmm_temp2 = Xbyak::Xmm(vmm_temp2.getIdx());
+    if (isa == x64::sse41) {
+        horizontal_mul_xmm(xmm_weight, xmm_temp1);
+    } else if (isa == x64::avx2) {
+        Xbyak::Ymm ymm_weight = Xbyak::Ymm(vmm_temp3.getIdx());
+        vextractf128(xmm_temp1, ymm_weight, 0);  // low 128 bits
+        vextractf128(xmm_temp2, ymm_weight, 1);  // high 128 bits
+        uni_vmulps(xmm_weight, xmm_temp1, xmm_temp2);  // fold 8 lanes into 4
+        horizontal_mul_xmm(xmm_weight, xmm_temp1);
+    } else {
+        Xbyak::Zmm zmm_weight = Xbyak::Zmm(vmm_temp3.getIdx());
+        vextractf32x4(xmm_temp1, zmm_weight, 0);
+        vextractf32x4(xmm_temp2, zmm_weight, 1);
+        uni_vmulps(xmm_temp1, xmm_temp1, xmm_temp2);  // quarter 0 * quarter 1
+        vextractf32x4(xmm_temp2, zmm_weight, 2);
+        vextractf32x4(xmm_weight, zmm_weight, 3);  // overwrites the low lanes of the weight register, hence extracted last
+        uni_vmulps(xmm_weight, xmm_weight, xmm_temp2);  // quarter 3 * quarter 2
+        uni_vmulps(xmm_weight, xmm_weight, xmm_temp1);  // fold 16 lanes into 4
+        horizontal_mul_xmm(xmm_weight, xmm_temp1);
+    }
+}
+
+template class NonMaxSuppression<x64::avx512_core>;  // explicit instantiations: the member definitions above live only in this .cpp
+template class NonMaxSuppression<x64::avx2>;
+template class NonMaxSuppression<x64::sse41>;
+
+}   // namespace kernel
+}   // namespace intel_cpu
+}   // namespace ov
diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/non_max_suppression.hpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/non_max_suppression.hpp
new file mode 100644
index 00000000000000..859f687db8dc14
--- /dev/null
+++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/non_max_suppression.hpp
@@ -0,0 +1,152 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "jit_kernel_base.hpp"
+
+#if defined(OPENVINO_ARCH_X86_64)
+#include "emitters/x64/jit_load_store_emitters.hpp"
+#include "cpu/x64/injectors/jit_uni_eltwise_injector.hpp"
+#endif // OPENVINO_ARCH_X86_64
+
+namespace ov {
+namespace intel_cpu {
+
+enum class NMSBoxEncodeType {
+    CORNER,  // boxes are given by two corners; the kernel comments describe the layout as y1, x1, y2, x2
+    CENTER   // boxes are given as x_center, y_center, width, height; the kernel converts them to corner form
+};
+
+#if defined(OPENVINO_ARCH_X86_64)
+
+namespace kernel {
+
+struct NmsCompileParams {
+    NMSBoxEncodeType box_encode_type;  // selects which coordinate normalization code is emitted
+    bool is_soft_suppressed_by_iou;  // when true, the soft-NMS loops also emit the iou_threshold check
+};
+
+struct NmsCallArgs {
+    const void* selected_boxes_coord[4];  // one float array per coordinate of the already selected boxes
+    size_t selected_boxes_num;  // number of elements in each of the four arrays
+    const void* candidate_box;  // four consecutive floats of the candidate box
+    const void* iou_threshold;  // pointer to one float
+    void* candidate_status;  // the kernel stores a 32-bit 0 here when the candidate is suppressed
+    // for soft suppression, score *= exp(scale * iou * iou);
+    const void* score_threshold;  // pointer to one float
+    const void* scale;  // pointer to one float; all-zero bits select hard NMS
+    void* score;  // pointer to one float, read and updated in place by soft NMS
+};
+
+
+template <dnnl::impl::cpu::x64::cpu_isa_t isa>
+class NonMaxSuppression : public JitKernel<NmsCompileParams, NmsCallArgs> {  // JIT kernel: checks one candidate box against the selected boxes
+public:
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(NonMaxSuppression)
+
+    explicit NonMaxSuppression(const NmsCompileParams& jcp) : JitKernel(jit_name(), jcp, isa) {}
+
+    void generate() override;  // emits the kernel code; defined in non_max_suppression.cpp
+
+private:
+    using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::avx512_core, Xbyak::Zmm,
+                                                         isa == dnnl::impl::cpu::x64::avx2,        Xbyak::Ymm,
+                                                                                                   Xbyak::Xmm>::type;
+    uint32_t vlen = dnnl::impl::cpu::x64::cpu_isa_traits<isa>::vlen;  // must stay declared before vector_step, which is initialized from it
+    const int vector_step = vlen / sizeof(float);  // floats per vector register
+    const int scalar_step = 1;
+
+    Xbyak::Reg64 reg_boxes_coord0 = r8;
+    Xbyak::Reg64 reg_boxes_coord1 = r9;
+    Xbyak::Reg64 reg_boxes_coord2 = r10;
+    Xbyak::Reg64 reg_boxes_coord3 = r11;
+    Xbyak::Reg64 reg_candidate_box = r12;
+    Xbyak::Reg64 reg_candidate_status = r13;
+    Xbyak::Reg64 reg_boxes_num = r14;
+    Xbyak::Reg64 reg_iou_threshold = r15;
+    // additional registers for the soft-NMS arguments
+    Xbyak::Reg64 reg_score_threshold = rdx;
+    Xbyak::Reg64 reg_score = rbp;
+    Xbyak::Reg64 reg_scale = rsi;
+
+    Xbyak::Reg64 reg_load_table = rax;
+    Xbyak::Reg64 reg_load_store_mask = rbx;
+
+    // rcx/rdi are reused: generate() writes them only after all call arguments have been read through reg_params
+    Xbyak::Label l_table_constant;
+    Xbyak::Reg64 reg_table = rcx;
+    Xbyak::Reg64 reg_temp_64 = rdi;
+    Xbyak::Reg32 reg_temp_32 = edi;  // low 32 bits of reg_temp_64
+
+    const Xbyak::Reg64 reg_params = Xbyak::Reg64(dnnl::impl::cpu::x64::abi_param_regs[0]);
+
+    std::unique_ptr<jit_load_emitter> load_vector_emitter = nullptr;
+    std::unique_ptr<jit_load_emitter> load_scalar_emitter = nullptr;
+
+    std::vector<size_t> store_pool_gpr_idxs;
+    std::vector<size_t> store_pool_vec_idxs;
+    std::vector<size_t> load_pool_gpr_idxs;
+
+    Vmm vmm_boxes_coord0 = Vmm(1);
+    Vmm vmm_boxes_coord1 = Vmm(2);
+    Vmm vmm_boxes_coord2 = Vmm(3);
+    Vmm vmm_boxes_coord3 = Vmm(4);
+    Vmm vmm_candidate_coord0 = Vmm(5);
+    Vmm vmm_candidate_coord1 = Vmm(6);
+    Vmm vmm_candidate_coord2 = Vmm(7);
+    Vmm vmm_candidate_coord3 = Vmm(8);
+    Vmm vmm_temp1 = Vmm(9);
+    Vmm vmm_temp2 = Vmm(10);
+    Vmm vmm_temp3 = Vmm(11);  // carries the iou and later the soft-NMS weight / new score
+    Vmm vmm_temp4 = Vmm(12);
+
+    Vmm vmm_iou_threshold = Vmm(13);
+    Vmm vmm_zero = Vmm(15);
+
+    // soft-NMS only
+    Vmm vmm_score_threshold = Vmm(14);
+    Vmm vmm_scale = Vmm(0);
+
+    Xbyak::Opmask k_mask = Xbyak::Opmask(7);
+    Xbyak::Opmask k_mask_one = Xbyak::Opmask(6);  // loaded with 0x0001 in generate(): selects lane 0
+
+    std::unique_ptr<dnnl::impl::cpu::x64::jit_uni_eltwise_injector_f32<isa>> exp_injector;  // solely owned, like the load emitters
+
+    inline void hard_nms();
+
+    inline void soft_nms();
+
+    inline void suppressed_by_iou(bool is_scalar);
+
+    inline void suppressed_by_score();
+
+    inline void iou(int ele_num);
+
+    inline void soft_coeff();
+
+    inline void horizontal_mul_xmm(const Xbyak::Xmm& xmm_weight, const Xbyak::Xmm& xmm_aux);
+
+    inline void horizontal_mul();
+
+    inline void prepare_table() {  // emits the constant table: vlen bytes of 0.5f followed by the 16-bit word 0x0001
+        auto broadcast_d = [&](int val) {
+            for (size_t d = 0; d < vlen / sizeof(int); ++d) {
+                dd(val);
+            }
+        };
+
+        align(64);
+        L(l_table_constant);
+        broadcast_d(0x3f000000);   // 0.5f
+        dw(0x0001);  // read at [reg_table + vlen] into k_mask_one
+    }
+};
+
+}   // namespace kernel
+
+#endif // OPENVINO_ARCH_X86_64
+
+}   // namespace intel_cpu
+}   // namespace ov
diff --git a/src/plugins/intel_cpu/src/nodes/non_max_suppression.cpp b/src/plugins/intel_cpu/src/nodes/non_max_suppression.cpp
index d2a46ac97da017..79112a3afa34a7 100644
--- a/src/plugins/intel_cpu/src/nodes/non_max_suppression.cpp
+++ b/src/plugins/intel_cpu/src/nodes/non_max_suppression.cpp
@@ -1,571 +1,41 @@
 // Copyright (C) 2018-2023 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
+// Copyright (c) Facebook, Inc. and its affiliates.
+// The implementation for rotated boxes intersection is based on the code from:
+// https://github.com/facebookresearch/detectron2/blob/v0.6/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h
+//
 
-#include <cmath>
-#include <string>
-#include <vector>
-#include <algorithm>
-#include <utility>
-#include <queue>
 
 #include "non_max_suppression.h"
+
 #include "ie_parallel.hpp"
-#include <ngraph/opsets/opset5.hpp>
-#include <ov_ops/nms_ie_internal.hpp>
 #include "utils/general_utils.h"
+#include "shape_inference/shape_inference_internal_dyn.hpp"
+#include "openvino/op/nms_rotated.hpp"
+#include "openvino/op/non_max_suppression.hpp"
+#include "ov_ops/nms_ie_internal.hpp"
 
-#include "cpu/x64/jit_generator.hpp"
-#include "emitters/x64/jit_load_store_emitters.hpp"
-#include <cpu/x64/injectors/jit_uni_eltwise_injector.hpp>
-#include <shape_inference/shape_inference_internal_dyn.hpp>
+#include <queue>
 
 using namespace InferenceEngine;
-using namespace dnnl;
-using namespace dnnl::impl;
-using namespace dnnl::impl::cpu::x64;
-using namespace dnnl::impl::utils;
-using namespace Xbyak;
-
-#define GET_OFF(field) offsetof(jit_nms_args, field)
 
 namespace ov {
 namespace intel_cpu {
 namespace node {
 
-#if defined(OPENVINO_ARCH_X86_64)
-template <cpu_isa_t isa>
-struct jit_uni_nms_kernel_f32 : public jit_uni_nms_kernel, public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_nms_kernel_f32)
-
-    explicit jit_uni_nms_kernel_f32(jit_nms_config_params jcp_) : jit_uni_nms_kernel(jcp_), jit_generator(jit_name()) {}
-
-    void create_ker() override {
-        jit_generator::create_kernel();
-        ker_ = (decltype(ker_))jit_ker();
-    }
-
-    void generate() override {
-        load_vector_emitter.reset(new jit_load_emitter(this, isa, Precision::FP32, Precision::FP32, vector_step));
-        load_scalar_emitter.reset(new jit_load_emitter(this, isa, Precision::FP32, Precision::FP32, scalar_step));
-
-        exp_injector.reset(new jit_uni_eltwise_injector_f32<isa>(this, dnnl::impl::alg_kind::eltwise_exp, 0.f, 0.f, 1.0f));
-
-        this->preamble();
-
-        uni_vpxor(vmm_zero, vmm_zero, vmm_zero);
-
-        load_pool_gpr_idxs = {static_cast<size_t>(reg_load_store_mask.getIdx()), static_cast<size_t>(reg_load_table.getIdx())};
-        store_pool_gpr_idxs = {static_cast<size_t>(reg_load_store_mask.getIdx())};
-        store_pool_vec_idxs = {static_cast<size_t>(vmm_zero.getIdx())};
-
-        mov(reg_boxes_coord0, ptr[reg_params + GET_OFF(selected_boxes_coord[0])]);
-        mov(reg_boxes_coord1, ptr[reg_params + GET_OFF(selected_boxes_coord[0]) + 1 * sizeof(size_t)]);
-        mov(reg_boxes_coord2, ptr[reg_params + GET_OFF(selected_boxes_coord[0]) + 2 * sizeof(size_t)]);
-        mov(reg_boxes_coord3, ptr[reg_params + GET_OFF(selected_boxes_coord[0]) + 3 * sizeof(size_t)]);
-        mov(reg_candidate_box, ptr[reg_params + GET_OFF(candidate_box)]);
-        mov(reg_candidate_status, ptr[reg_params + GET_OFF(candidate_status)]);
-        mov(reg_boxes_num, ptr[reg_params + GET_OFF(selected_boxes_num)]);
-        mov(reg_iou_threshold, ptr[reg_params + GET_OFF(iou_threshold)]);
-        // soft
-        mov(reg_score_threshold, ptr[reg_params + GET_OFF(score_threshold)]);
-        mov(reg_score, ptr[reg_params + GET_OFF(score)]);
-        mov(reg_scale, ptr[reg_params + GET_OFF(scale)]);
-
-        // could use rcx(reg_table) and rdi(reg_temp) now as abi parse finished
-        mov(reg_table, l_table_constant);
-        if (mayiuse(cpu::x64::avx512_core)) {
-            kmovw(k_mask_one, word[reg_table + vlen]);
-        }
-        uni_vbroadcastss(vmm_iou_threshold, ptr[reg_iou_threshold]);
-        uni_vbroadcastss(vmm_score_threshold, ptr[reg_score_threshold]);
-
-        uni_vbroadcastss(vmm_candidate_coord0, ptr[reg_candidate_box]);
-        uni_vbroadcastss(vmm_candidate_coord1, ptr[reg_candidate_box + 1 * sizeof(float)]);
-        uni_vbroadcastss(vmm_candidate_coord2, ptr[reg_candidate_box + 2 * sizeof(float)]);
-        uni_vbroadcastss(vmm_candidate_coord3, ptr[reg_candidate_box + 3 * sizeof(float)]);
-
-        if (jcp.box_encode_type == NMSBoxEncodeType::CORNER) {
-            // box format: y1, x1, y2, x2
-            uni_vminps(vmm_temp1, vmm_candidate_coord0, vmm_candidate_coord2);
-            uni_vmaxps(vmm_temp2, vmm_candidate_coord0, vmm_candidate_coord2);
-            uni_vmovups(vmm_candidate_coord0, vmm_temp1);
-            uni_vmovups(vmm_candidate_coord2, vmm_temp2);
-
-            uni_vminps(vmm_temp1, vmm_candidate_coord1, vmm_candidate_coord3);
-            uni_vmaxps(vmm_temp2, vmm_candidate_coord1, vmm_candidate_coord3);
-            uni_vmovups(vmm_candidate_coord1, vmm_temp1);
-            uni_vmovups(vmm_candidate_coord3, vmm_temp2);
-        } else {
-            // box format: x_center, y_center, width, height --> y1, x1, y2, x2
-            uni_vmulps(vmm_temp1, vmm_candidate_coord2, ptr[reg_table]);   // width/2
-            uni_vmulps(vmm_temp2, vmm_candidate_coord3, ptr[reg_table]);   // height/2
-
-            uni_vaddps(vmm_temp3, vmm_candidate_coord0, vmm_temp1);  // x_center + width/2
-            uni_vmovups(vmm_candidate_coord3, vmm_temp3);
-
-            uni_vaddps(vmm_temp3, vmm_candidate_coord1, vmm_temp2);  // y_center + height/2
-            uni_vmovups(vmm_candidate_coord2, vmm_temp3);
-
-            uni_vsubps(vmm_temp3, vmm_candidate_coord0, vmm_temp1);  // x_center - width/2
-            uni_vsubps(vmm_temp4, vmm_candidate_coord1, vmm_temp2);  // y_center - height/2
-
-            uni_vmovups(vmm_candidate_coord1, vmm_temp3);
-            uni_vmovups(vmm_candidate_coord0, vmm_temp4);
-        }
-
-        // check from last to first
-        imul(reg_temp_64, reg_boxes_num, sizeof(float));
-        add(reg_boxes_coord0, reg_temp_64);  // y1
-        add(reg_boxes_coord1, reg_temp_64);  // x1
-        add(reg_boxes_coord2, reg_temp_64);  // y2
-        add(reg_boxes_coord3, reg_temp_64);  // x2
-
-        Xbyak::Label hard_nms_label;
-        Xbyak::Label nms_end_label;
-
-        mov(reg_temp_32, ptr[reg_scale]);
-        test(reg_temp_32, reg_temp_32);
-        jz(hard_nms_label, T_NEAR);
-
-        soft_nms();
-
-        jmp(nms_end_label, T_NEAR);
-
-        L(hard_nms_label);
-
-        hard_nms();
-
-        L(nms_end_label);
-
-        this->postamble();
-
-        load_vector_emitter->emit_data();
-        load_scalar_emitter->emit_data();
-
-        prepare_table();
-        exp_injector->prepare_table();
-    }
-
-private:
-    using Vmm = typename conditional3<isa == cpu::x64::sse41, Xbyak::Xmm, isa == cpu::x64::avx2, Xbyak::Ymm, Xbyak::Zmm>::type;
-    uint32_t vlen = cpu_isa_traits<isa>::vlen;
-    const int vector_step = vlen / sizeof(float);
-    const int scalar_step = 1;
-
-    Xbyak::Reg64 reg_boxes_coord0 = r8;
-    Xbyak::Reg64 reg_boxes_coord1 = r9;
-    Xbyak::Reg64 reg_boxes_coord2 = r10;
-    Xbyak::Reg64 reg_boxes_coord3 = r11;
-    Xbyak::Reg64 reg_candidate_box = r12;
-    Xbyak::Reg64 reg_candidate_status = r13;
-    Xbyak::Reg64 reg_boxes_num = r14;
-    Xbyak::Reg64 reg_iou_threshold = r15;
-    // more for soft
-    Xbyak::Reg64 reg_score_threshold = rdx;
-    Xbyak::Reg64 reg_score = rbp;
-    Xbyak::Reg64 reg_scale = rsi;
-
-    Xbyak::Reg64 reg_load_table = rax;
-    Xbyak::Reg64 reg_load_store_mask = rbx;
-
-    // reuse
-    Xbyak::Label l_table_constant;
-    Xbyak::Reg64 reg_table = rcx;
-    Xbyak::Reg64 reg_temp_64 = rdi;
-    Xbyak::Reg32 reg_temp_32 = edi;
-
-    Xbyak::Reg64 reg_params = abi_param1;
-
-    std::unique_ptr<jit_load_emitter> load_vector_emitter = nullptr;
-    std::unique_ptr<jit_load_emitter> load_scalar_emitter = nullptr;
-
-    std::vector<size_t> store_pool_gpr_idxs;
-    std::vector<size_t> store_pool_vec_idxs;
-    std::vector<size_t> load_pool_gpr_idxs;
-
-    Vmm vmm_boxes_coord0 = Vmm(1);
-    Vmm vmm_boxes_coord1 = Vmm(2);
-    Vmm vmm_boxes_coord2 = Vmm(3);
-    Vmm vmm_boxes_coord3 = Vmm(4);
-    Vmm vmm_candidate_coord0 = Vmm(5);
-    Vmm vmm_candidate_coord1 = Vmm(6);
-    Vmm vmm_candidate_coord2 = Vmm(7);
-    Vmm vmm_candidate_coord3 = Vmm(8);
-    Vmm vmm_temp1 = Vmm(9);
-    Vmm vmm_temp2 = Vmm(10);
-    Vmm vmm_temp3 = Vmm(11);
-    Vmm vmm_temp4 = Vmm(12);
-
-    Vmm vmm_iou_threshold = Vmm(13);
-    Vmm vmm_zero = Vmm(15);
-
-    // soft
-    Vmm vmm_score_threshold = Vmm(14);
-    Vmm vmm_scale = Vmm(0);
-
-    Xbyak::Opmask k_mask = Xbyak::Opmask(7);
-    Xbyak::Opmask k_mask_one = Xbyak::Opmask(6);
-
-    std::shared_ptr<jit_uni_eltwise_injector_f32<isa>> exp_injector;
-
-    inline void hard_nms() {
-        Xbyak::Label main_loop_label_hard;
-        Xbyak::Label main_loop_end_label_hard;
-        Xbyak::Label tail_loop_label_hard;
-        Xbyak::Label terminate_label_hard;
-        L(main_loop_label_hard);
-        {
-            cmp(reg_boxes_num, vector_step);
-            jl(main_loop_end_label_hard, T_NEAR);
-
-            sub(reg_boxes_coord0, vector_step * sizeof(float));
-            sub(reg_boxes_coord1, vector_step * sizeof(float));
-            sub(reg_boxes_coord2, vector_step * sizeof(float));
-            sub(reg_boxes_coord3, vector_step * sizeof(float));
-
-            // iou result is in vmm_temp3
-            iou(vector_step);
-
-            sub(reg_boxes_num, vector_step);
-
-            suppressed_by_iou(false);
-
-            // if zero continue, else set result to suppressed and terminate
-            jz(main_loop_label_hard, T_NEAR);
-
-            uni_vpextrd(ptr[reg_candidate_status], Xmm(vmm_zero.getIdx()), 0);
-
-            jmp(terminate_label_hard, T_NEAR);
-        }
-        L(main_loop_end_label_hard);
-
-        L(tail_loop_label_hard);
-        {
-            cmp(reg_boxes_num, 1);
-            jl(terminate_label_hard, T_NEAR);
-
-            sub(reg_boxes_coord0, scalar_step * sizeof(float));
-            sub(reg_boxes_coord1, scalar_step * sizeof(float));
-            sub(reg_boxes_coord2, scalar_step * sizeof(float));
-            sub(reg_boxes_coord3, scalar_step * sizeof(float));
-
-            // iou result is in vmm_temp3
-            iou(scalar_step);
-
-            sub(reg_boxes_num, scalar_step);
-
-            suppressed_by_iou(true);
-
-            jz(tail_loop_label_hard, T_NEAR);
-
-            uni_vpextrd(ptr[reg_candidate_status], Xmm(vmm_zero.getIdx()), 0);
-
-            jmp(terminate_label_hard, T_NEAR);
-        }
-
-        L(terminate_label_hard);
-    }
-
-    inline void soft_nms() {
-        uni_vbroadcastss(vmm_scale, ptr[reg_scale]);
-
-        Xbyak::Label main_loop_label;
-        Xbyak::Label main_loop_end_label;
-        Xbyak::Label tail_loop_label;
-        Xbyak::Label terminate_label;
-
-        Xbyak::Label main_loop_label_soft;
-        Xbyak::Label tail_loop_label_soft;
-        L(main_loop_label);
-        {
-            cmp(reg_boxes_num, vector_step);
-            jl(main_loop_end_label, T_NEAR);
-
-            sub(reg_boxes_coord0, vector_step * sizeof(float));
-            sub(reg_boxes_coord1, vector_step * sizeof(float));
-            sub(reg_boxes_coord2, vector_step * sizeof(float));
-            sub(reg_boxes_coord3, vector_step * sizeof(float));
-
-            // result(iou and weight) is in vmm_temp3
-            iou(vector_step);
-            sub(reg_boxes_num, vector_step);
-
-            // soft suppressed by iou_threshold
-            if (jcp.is_soft_suppressed_by_iou) {
-                suppressed_by_iou(false);
-
-                // if zero continue soft suppression, else set result to suppressed and terminate
-                jz(main_loop_label_soft, T_NEAR);
-
-                uni_vpextrd(ptr[reg_candidate_status], Xmm(vmm_zero.getIdx()), 0);
-
-                jmp(terminate_label, T_NEAR);
-
-                L(main_loop_label_soft);
-            }
-
-            // weight: std::exp(scale * iou * iou)
-            soft_coeff();
-
-            // vector weights multiply
-            horizontal_mul();
-
-            uni_vbroadcastss(vmm_temp1, ptr[reg_score]);
-
-            // new score in vmm3[0]
-            uni_vmulps(vmm_temp3, vmm_temp3, vmm_temp1);
-            // store new score
-            uni_vmovss(ptr[reg_score], vmm_temp3);
-
-            // cmpps(_CMP_LE_OS) if new score is less or equal than score_threshold
-            suppressed_by_score();
-
-            jz(main_loop_label, T_NEAR);
-
-            uni_vpextrd(ptr[reg_candidate_status], Xmm(vmm_zero.getIdx()), 0);
-
-            jmp(terminate_label, T_NEAR);
-        }
-        L(main_loop_end_label);
-
-        L(tail_loop_label);
-        {
-            cmp(reg_boxes_num, 1);
-            jl(terminate_label, T_NEAR);
-
-            sub(reg_boxes_coord0, scalar_step * sizeof(float));
-            sub(reg_boxes_coord1, scalar_step * sizeof(float));
-            sub(reg_boxes_coord2, scalar_step * sizeof(float));
-            sub(reg_boxes_coord3, scalar_step * sizeof(float));
-
-            iou(scalar_step);
-            sub(reg_boxes_num, scalar_step);
-
-            // soft suppressed by iou_threshold
-            if (jcp.is_soft_suppressed_by_iou) {
-                suppressed_by_iou(true);
-
-                jz(tail_loop_label_soft, T_NEAR);
-
-                uni_vpextrd(ptr[reg_candidate_status], Xmm(vmm_zero.getIdx()), 0);
-
-                jmp(terminate_label, T_NEAR);
-
-                L(tail_loop_label_soft);
-            }
-
-            soft_coeff();
-
-            uni_vbroadcastss(vmm_temp1, ptr[reg_score]);
-
-            // vmm3[0] is valide, no need horizontal mul.
-            uni_vmulps(vmm_temp3, vmm_temp3, vmm_temp1);
-
-            uni_vmovss(ptr[reg_score], vmm_temp3);
-
-            // cmpps(_CMP_LE_OS) if new score is less or equal than score_threshold
-            suppressed_by_score();
-
-            jz(tail_loop_label, T_NEAR);
-
-            uni_vpextrd(ptr[reg_candidate_status], Xmm(vmm_zero.getIdx()), 0);
-
-            jmp(terminate_label, T_NEAR);
-        }
-
-        L(terminate_label);
-    }
-
-    inline void suppressed_by_iou(bool is_scalar) {
-        if (mayiuse(cpu::x64::avx512_core)) {
-            vcmpps(k_mask, vmm_temp3, vmm_iou_threshold, 0x0D); // _CMP_GE_OS. vcmpps w/ kmask only on V5
-            if (is_scalar)
-                kandw(k_mask, k_mask, k_mask_one);
-            kortestw(k_mask, k_mask);    // bitwise check if all zero
-        } else if (mayiuse(cpu::x64::avx)) {
-            // vex instructions with xmm on avx and ymm on avx2
-            vcmpps(vmm_temp4, vmm_temp3, vmm_iou_threshold, 0x0D);  // xmm and ymm only on V1.
-            if (is_scalar) {
-                uni_vpextrd(reg_temp_32, Xmm(vmm_temp4.getIdx()), 0);
-                test(reg_temp_32, reg_temp_32);
-            } else {
-                uni_vtestps(vmm_temp4, vmm_temp4);  // vtestps: sign bit check if all zeros, ymm and xmm only on V1, N/A on V5
-            }
-        } else {
-            // pure sse path, make sure don't spoil vmm_temp3, which may used in after soft-suppression
-            uni_vmovups(vmm_temp4, vmm_temp3);
-            cmpps(vmm_temp4, vmm_iou_threshold, 0x07);  // order compare, 0 for at least one is NaN
-
-            uni_vmovups(vmm_temp2, vmm_temp3);
-            cmpps(vmm_temp2, vmm_iou_threshold, 0x05);   // _CMP_GE_US on sse, no direct _CMP_GE_OS supported.
-
-            uni_vandps(vmm_temp4, vmm_temp4, vmm_temp2);
-            if (is_scalar) {
-                uni_vpextrd(reg_temp_32, Xmm(vmm_temp4.getIdx()), 0);
-                test(reg_temp_32, reg_temp_32);
-            } else {
-                uni_vtestps(vmm_temp4, vmm_temp4);  // ptest: bitwise check if all zeros, on sse41
-            }
-        }
-    }
-
-    inline void suppressed_by_score() {
-        if (mayiuse(cpu::x64::avx512_core)) {
-            vcmpps(k_mask, vmm_temp3, vmm_score_threshold, 0x02); // vcmpps w/ kmask only on V5, w/o kmask version N/A on V5
-            kandw(k_mask, k_mask, k_mask_one);
-            kortestw(k_mask, k_mask);    // bitwise check if all zero
-        } else if (mayiuse(cpu::x64::avx)) {
-            vcmpps(vmm_temp4, vmm_temp3, vmm_score_threshold, 0x02);
-            uni_vpextrd(reg_temp_32, Xmm(vmm_temp4.getIdx()), 0);
-            test(reg_temp_32, reg_temp_32);
-        } else {
-            cmpps(vmm_temp3, vmm_score_threshold, 0x02);  // _CMP_LE_OS on sse
-            uni_vpextrd(reg_temp_32, Xmm(vmm_temp3.getIdx()), 0);
-            test(reg_temp_32, reg_temp_32);
-        }
-    }
-
-    inline void iou(int ele_num) {
-        auto load = [&](Xbyak::Reg64 reg_src, Vmm vmm_dst) {
-            if (ele_num != scalar_step && ele_num != vector_step)
-                IE_THROW() << "NMS JIT implementation supports load emitter with only element count scalar_step or vector_step! Get: " << ele_num;
-
-            const auto& load_emitter = ele_num == 1 ? load_scalar_emitter : load_vector_emitter;
-            load_emitter->emit_code({static_cast<size_t>(reg_src.getIdx())}, {static_cast<size_t>(vmm_dst.getIdx())},
-                {}, {load_pool_gpr_idxs});
-        };
-        load(reg_boxes_coord0, vmm_boxes_coord0);
-        load(reg_boxes_coord1, vmm_boxes_coord1);
-        load(reg_boxes_coord2, vmm_boxes_coord2);
-        load(reg_boxes_coord3, vmm_boxes_coord3);
-
-        if (jcp.box_encode_type == NMSBoxEncodeType::CORNER) {
-            // box format: y1, x1, y2, x2
-            uni_vminps(vmm_temp1, vmm_boxes_coord0, vmm_boxes_coord2);
-            uni_vmaxps(vmm_temp2, vmm_boxes_coord0, vmm_boxes_coord2);
-            uni_vmovups(vmm_boxes_coord0, vmm_temp1);
-            uni_vmovups(vmm_boxes_coord2, vmm_temp2);
-
-            uni_vminps(vmm_temp1, vmm_boxes_coord1, vmm_boxes_coord3);
-            uni_vmaxps(vmm_temp2, vmm_boxes_coord1, vmm_boxes_coord3);
-            uni_vmovups(vmm_boxes_coord1, vmm_temp1);
-            uni_vmovups(vmm_boxes_coord3, vmm_temp2);
-        } else {
-            // box format: x_center, y_center, width, height --> y1, x1, y2, x2
-            uni_vmulps(vmm_temp1, vmm_boxes_coord2, ptr[reg_table]);   // width/2
-            uni_vmulps(vmm_temp2, vmm_boxes_coord3, ptr[reg_table]);   // height/2
-
-            uni_vaddps(vmm_temp3, vmm_boxes_coord0, vmm_temp1);  // x_center + width/2
-            uni_vmovups(vmm_boxes_coord3, vmm_temp3);
-
-            uni_vaddps(vmm_temp3, vmm_boxes_coord1, vmm_temp2);  // y_center + height/2
-            uni_vmovups(vmm_boxes_coord2, vmm_temp3);
-
-            uni_vsubps(vmm_temp3, vmm_boxes_coord0, vmm_temp1);  // x_center - width/2
-            uni_vsubps(vmm_temp4, vmm_boxes_coord1, vmm_temp2);  // y_center - height/2
-
-            uni_vmovups(vmm_boxes_coord1, vmm_temp3);
-            uni_vmovups(vmm_boxes_coord0, vmm_temp4);
-        }
-
-        uni_vsubps(vmm_temp1, vmm_boxes_coord2, vmm_boxes_coord0);
-        uni_vsubps(vmm_temp2, vmm_boxes_coord3, vmm_boxes_coord1);
-        uni_vmulps(vmm_temp1, vmm_temp1, vmm_temp2);  // boxes area
-
-        uni_vsubps(vmm_temp2, vmm_candidate_coord2, vmm_candidate_coord0);
-        uni_vsubps(vmm_temp3, vmm_candidate_coord3, vmm_candidate_coord1);
-        uni_vmulps(vmm_temp2, vmm_temp2, vmm_temp3);  // candidate(bc) area  // candidate area calculate once and check if 0
-
-        uni_vaddps(vmm_temp1, vmm_temp1, vmm_temp2);  // areaI + areaJ to free vmm_temp2
-
-        // y of intersection
-        uni_vminps(vmm_temp3, vmm_boxes_coord2, vmm_candidate_coord2);  // min(Ymax)
-        uni_vmaxps(vmm_temp4, vmm_boxes_coord0, vmm_candidate_coord0);  // max(Ymin)
-        uni_vsubps(vmm_temp3, vmm_temp3, vmm_temp4);  // min(Ymax) - max(Ymin)
-        uni_vmaxps(vmm_temp3, vmm_temp3, vmm_zero);
-
-        // x of intersection
-        uni_vminps(vmm_temp4, vmm_boxes_coord3, vmm_candidate_coord3);  // min(Xmax)
-        uni_vmaxps(vmm_temp2, vmm_boxes_coord1, vmm_candidate_coord1);  // max(Xmin)
-        uni_vsubps(vmm_temp4, vmm_temp4, vmm_temp2);  // min(Xmax) - max(Xmin)
-        uni_vmaxps(vmm_temp4, vmm_temp4, vmm_zero);
-
-        // intersection_area
-        uni_vmulps(vmm_temp3, vmm_temp3, vmm_temp4);
-
-        // iou: intersection_area / (areaI + areaJ - intersection_area);
-        uni_vsubps(vmm_temp1, vmm_temp1, vmm_temp3);
-        uni_vdivps(vmm_temp3, vmm_temp3, vmm_temp1);
-    }
-
-    // std::exp(scale * iou * iou)
-    inline void soft_coeff() {
-        uni_vmulps(vmm_temp3, vmm_temp3, vmm_temp3);
-        uni_vmulps(vmm_temp3, vmm_temp3, vmm_scale);
-        exp_injector->compute_vector_range(vmm_temp3.getIdx(), vmm_temp3.getIdx() + 1);
-    }
-
-    inline void horizontal_mul_xmm(const Xbyak::Xmm &xmm_weight, const Xbyak::Xmm &xmm_aux) {
-        uni_vmovshdup(xmm_aux, xmm_weight);              //  weight:1,2,3,4; aux:2,2,4,4
-        uni_vmulps(xmm_weight, xmm_weight, xmm_aux);     //  weight:1*2,2*2,3*4,4*4
-        uni_vmovhlps(xmm_aux, xmm_aux, xmm_weight);      //  aux:3*4,4*4,4,4
-        uni_vmulps(xmm_weight, xmm_weight, xmm_aux);     //  weight:1*2*3*4,...
-    }
-
-    // horizontal mul for vmm_weight(Vmm(3)), temp1 and temp2 as aux
-    inline void horizontal_mul() {
-        Xbyak::Xmm xmm_weight = Xbyak::Xmm(vmm_temp3.getIdx());
-        Xbyak::Xmm xmm_temp1 = Xbyak::Xmm(vmm_temp1.getIdx());
-        Xbyak::Xmm xmm_temp2 = Xbyak::Xmm(vmm_temp2.getIdx());
-        if (isa == cpu::x64::sse41) {
-            horizontal_mul_xmm(xmm_weight, xmm_temp1);
-        } else if (isa == cpu::x64::avx2) {
-            Xbyak::Ymm ymm_weight = Xbyak::Ymm(vmm_temp3.getIdx());
-            vextractf128(xmm_temp1, ymm_weight, 0);
-            vextractf128(xmm_temp2, ymm_weight, 1);
-            uni_vmulps(xmm_weight, xmm_temp1, xmm_temp2);
-            horizontal_mul_xmm(xmm_weight, xmm_temp1);
-        } else {
-            Xbyak::Zmm zmm_weight = Xbyak::Zmm(vmm_temp3.getIdx());
-            vextractf32x4(xmm_temp1, zmm_weight, 0);
-            vextractf32x4(xmm_temp2, zmm_weight, 1);
-            uni_vmulps(xmm_temp1, xmm_temp1, xmm_temp2);
-            vextractf32x4(xmm_temp2, zmm_weight, 2);
-            vextractf32x4(xmm_weight, zmm_weight, 3);
-            uni_vmulps(xmm_weight, xmm_weight, xmm_temp2);
-            uni_vmulps(xmm_weight, xmm_weight, xmm_temp1);
-            horizontal_mul_xmm(xmm_weight, xmm_temp1);
-        }
-    }
-
-    inline void prepare_table() {
-        auto broadcast_d = [&](int val) {
-            for (size_t d = 0; d < vlen / sizeof(int); ++d) {
-                dd(val);
-            }
-        };
-
-        align(64);
-        L(l_table_constant);
-        broadcast_d(0x3f000000);   // 0.5f
-        dw(0x0001);
-    }
-};
-#endif
-
-bool NonMaxSuppression::isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept {
+bool NonMaxSuppression::isSupportedOperation(const std::shared_ptr<const ov::Node>& op, std::string& errorMessage) noexcept {
     try {
-        using NonMaxSuppressionV9 = ngraph::op::v9::NonMaxSuppression;
-        if (!one_of(op->get_type_info(), NonMaxSuppressionV9::get_type_info_static(),
-                    ov::op::internal::NonMaxSuppressionIEInternal::get_type_info_static())) {
-            errorMessage = "Only NonMaxSuppression v9 and NonMaxSuppressionIEInternal are supported";
+        if (!one_of(op->get_type_info(), op::v9::NonMaxSuppression::get_type_info_static(),
+                                         op::internal::NonMaxSuppressionIEInternal::get_type_info_static(),
+                                         op::v13::NMSRotated::get_type_info_static())) {
+            errorMessage = "Only NonMaxSuppression from opset9, NonMaxSuppressionIEInternal and NMSRotated from opset13 are supported.";
             return false;
         }
 
-        if (const auto nms9 = std::dynamic_pointer_cast<const NonMaxSuppressionV9>(op)) {
+        if (auto nms9 = as_type<const op::v9::NonMaxSuppression>(op.get())) {
             const auto boxEncoding = nms9->get_box_encoding();
-            if (!one_of(boxEncoding, NonMaxSuppressionV9::BoxEncodingType::CENTER, NonMaxSuppressionV9::BoxEncodingType::CORNER)) {
+            if (!one_of(boxEncoding, op::v9::NonMaxSuppression::BoxEncodingType::CENTER, op::v9::NonMaxSuppression::BoxEncodingType::CORNER)) {
                 errorMessage = "Supports only CENTER and CORNER box encoding type";
                 return false;
             }
@@ -576,107 +46,125 @@ bool NonMaxSuppression::isSupportedOperation(const std::shared_ptr<const ngraph:
     return true;
 }
 
-NonMaxSuppression::NonMaxSuppression(const std::shared_ptr<ngraph::Node>& op, const GraphContext::CPtr context)
-    : Node(op, context, InternalDynShapeInferFactory()),
-      isSoftSuppressedByIOU(false) {
+NonMaxSuppression::NonMaxSuppression(const std::shared_ptr<ov::Node>& op, const GraphContext::CPtr& context)
+        : Node(op, context, InternalDynShapeInferFactory()),
+          m_is_soft_suppressed_by_iou(false) {
     std::string errorMessage;
     if (!isSupportedOperation(op, errorMessage)) {
-        IE_THROW(NotImplemented) << errorMessage;
+        OPENVINO_THROW(errorMessage);
     }
 
-    errorPrefix = "NMS layer with name '" + op->get_friendly_name() + "' ";
-    if (one_of(op->get_type_info(), ov::op::internal::NonMaxSuppressionIEInternal::get_type_info_static()))
-        m_outStaticShape = true;
-
-    if (getOriginalInputsNumber() < 2 || getOriginalInputsNumber() > 6)
-        IE_THROW() << errorPrefix << "has incorrect number of input edges: " << getOriginalInputsNumber();
+    if (one_of(op->get_type_info(), op::internal::NonMaxSuppressionIEInternal::get_type_info_static())) {
+        m_out_static_shape = true;
+    }
 
-    if (getOriginalOutputsNumber() != 3)
-        IE_THROW() << errorPrefix << "has incorrect number of output edges: " << getOriginalOutputsNumber();
+    if (getOriginalInputsNumber() < 2 || getOriginalInputsNumber() > NMS_SOFT_NMS_SIGMA + 1) {
+        THROW_CPU_NODE_ERR("has incorrect number of input edges: ", getOriginalInputsNumber());
+    }
+    if (getOriginalOutputsNumber() != 3) {
+        THROW_CPU_NODE_ERR("has incorrect number of output edges: ", getOriginalOutputsNumber());
+    }
 
-    if (const auto nms9 = std::dynamic_pointer_cast<const ngraph::op::v9::NonMaxSuppression>(op)) {
+    if (auto nms9 = as_type<const op::v9::NonMaxSuppression>(op.get())) {
         boxEncodingType = static_cast<NMSBoxEncodeType>(nms9->get_box_encoding());
-        sortResultDescending = nms9->get_sort_result_descending();
-        } else if (const auto nmsIe = std::dynamic_pointer_cast<const ov::op::internal::NonMaxSuppressionIEInternal>(op)) {
-            boxEncodingType = nmsIe->m_center_point_box ? NMSBoxEncodeType::CENTER : NMSBoxEncodeType::CORNER;
-            sortResultDescending = nmsIe->m_sort_result_descending;
-        } else {
-            const auto &typeInfo = op->get_type_info();
-            IE_THROW() << errorPrefix << " doesn't support NMS: " << typeInfo.name << " v" << typeInfo.version_id;
-        }
+        m_sort_result_descending = nms9->get_sort_result_descending();
+        m_coord_num = 4lu;
+    } else if (auto nmsIe = as_type<const op::internal::NonMaxSuppressionIEInternal>(op.get())) {
+        boxEncodingType = nmsIe->m_center_point_box ? NMSBoxEncodeType::CENTER : NMSBoxEncodeType::CORNER;
+        m_sort_result_descending = nmsIe->m_sort_result_descending;
+        m_coord_num = 4lu;
+    } else if (auto nms = as_type<const op::v13::NMSRotated>(op.get())) {
+        m_sort_result_descending = nms->get_sort_result_descending();
+        m_clockwise = nms->get_clockwise();
+        m_rotated_boxes = true;
+        m_coord_num = 5lu;
+    } else {
+        const auto &typeInfo = op->get_type_info();
+        THROW_CPU_NODE_ERR("doesn't support NMS: ", typeInfo.name, " v", typeInfo.version_id);
+    }
+
+    const auto &boxes_dims = getInputShapeAtPort(NMS_BOXES).getDims();
+    if (boxes_dims.size() != 3) {
+        THROW_CPU_NODE_ERR("has unsupported 'boxes' input rank: ", boxes_dims.size());
+    }
+    if (boxes_dims[2] != m_coord_num) {
+        THROW_CPU_NODE_ERR("has unsupported 'boxes' input 3rd dimension size: ", boxes_dims[2]);
+    }
+
+    const auto &scores_dims = getInputShapeAtPort(NMS_SCORES).getDims();
+    if (scores_dims.size() != 3) {
+        THROW_CPU_NODE_ERR("has unsupported 'scores' input rank: ", scores_dims.size());
+    }
+
+    const auto& valid_outputs_shape = getOutputShapeAtPort(NMS_VALID_OUTPUTS);
+    if (valid_outputs_shape.getRank() != 1) {
+        THROW_CPU_NODE_ERR("has unsupported 'valid_outputs' output rank: ", valid_outputs_shape.getRank());
+    }
+    if (valid_outputs_shape.getDims()[0] != 1) {  // rank == 1 was verified above, so index 0 is the only valid dimension
+        THROW_CPU_NODE_ERR("has unsupported 'valid_outputs' output 1st dimension size: ", valid_outputs_shape.getDims()[0]);
+    }
 
-        const auto &boxes_dims = getInputShapeAtPort(NMS_BOXES).getDims();
-        if (boxes_dims.size() != 3)
-            IE_THROW() << errorPrefix << "has unsupported 'boxes' input rank: " << boxes_dims.size();
-        if (boxes_dims[2] != 4)
-            IE_THROW() << errorPrefix << "has unsupported 'boxes' input 3rd dimension size: " << boxes_dims[2];
-
-        const auto &scores_dims = getInputShapeAtPort(NMS_SCORES).getDims();
-        if (scores_dims.size() != 3)
-            IE_THROW() << errorPrefix << "has unsupported 'scores' input rank: " << scores_dims.size();
-
-        const Shape valid_outputs_shape = getOutputShapeAtPort(NMS_VALIDOUTPUTS);
-        if (valid_outputs_shape.getRank() != 1)
-            IE_THROW() << errorPrefix << "has unsupported 'valid_outputs' output rank: " << valid_outputs_shape.getRank();
-        if (valid_outputs_shape.getDims()[0] != 1)
-            IE_THROW() << errorPrefix << "has unsupported 'valid_outputs' output 1st dimension size: " << valid_outputs_shape.getDims()[1];
+    for (size_t i = 0lu; i < op->get_output_size(); i++) {
+        m_defined_outputs[i] = !op->get_output_target_inputs(i).empty();
+    }
 }
 
 void NonMaxSuppression::initSupportedPrimitiveDescriptors() {
     if (!supportedPrimitiveDescriptors.empty())
         return;
 
-    const std::vector<Precision> supportedFloatPrecision = {Precision::FP32, Precision::BF16, Precision::FP16};
-    const std::vector<Precision> supportedIntOutputPrecision = {Precision::I32, Precision::I64};
-
-    checkPrecision(getOriginalInputPrecisionAtPort(NMS_BOXES), supportedFloatPrecision, "boxes", inType);
-    checkPrecision(getOriginalInputPrecisionAtPort(NMS_SCORES), supportedFloatPrecision, "scores", inType);
-    checkPrecision(getOriginalOutputPrecisionAtPort(NMS_VALIDOUTPUTS), supportedIntOutputPrecision, "valid_outputs", outType);
-
-    const std::vector<Precision> supportedPrecision = {Precision::I16, Precision::U8, Precision::I8, Precision::U16, Precision::I32,
-                                                       Precision::U32, Precision::I64, Precision::U64};
-
-    if (inputShapes.size() > NMS_MAXOUTPUTBOXESPERCLASS)
-        check1DInput(getInputShapeAtPort(NMS_MAXOUTPUTBOXESPERCLASS), supportedPrecision, "max_output_boxes_per_class", NMS_MAXOUTPUTBOXESPERCLASS);
-    if (inputShapes.size() > NMS_IOUTHRESHOLD)
-        check1DInput(getInputShapeAtPort(NMS_IOUTHRESHOLD), supportedFloatPrecision, "iou_threshold", NMS_IOUTHRESHOLD);
-    if (inputShapes.size() > NMS_SCORETHRESHOLD)
-        check1DInput(getInputShapeAtPort(NMS_SCORETHRESHOLD), supportedFloatPrecision, "score_threshold", NMS_SCORETHRESHOLD);
-    if (inputShapes.size() > NMS_SOFTNMSSIGMA)
-        check1DInput(getInputShapeAtPort(NMS_SCORETHRESHOLD), supportedFloatPrecision, "soft_nms_sigma", NMS_SCORETHRESHOLD);
+    const auto inputs_num = inputShapes.size();
+    if (inputs_num > NMS_MAX_OUTPUT_BOXES_PER_CLASS) {
+        check1DInput(getInputShapeAtPort(NMS_MAX_OUTPUT_BOXES_PER_CLASS), "max_output_boxes_per_class", NMS_MAX_OUTPUT_BOXES_PER_CLASS);
+    }
+    if (inputs_num > NMS_IOU_THRESHOLD) {
+        check1DInput(getInputShapeAtPort(NMS_IOU_THRESHOLD), "iou_threshold", NMS_IOU_THRESHOLD);
+    }
+    if (inputs_num > NMS_SCORE_THRESHOLD) {
+        check1DInput(getInputShapeAtPort(NMS_SCORE_THRESHOLD), "score_threshold", NMS_SCORE_THRESHOLD);
+    }
+    if (inputs_num > NMS_SOFT_NMS_SIGMA) {  // validate the soft_nms_sigma port itself, not score_threshold a second time
+        check1DInput(getInputShapeAtPort(NMS_SOFT_NMS_SIGMA), "soft_nms_sigma", NMS_SOFT_NMS_SIGMA);
+    }
 
-    checkOutput(getOutputShapeAtPort(NMS_SELECTEDINDICES), supportedIntOutputPrecision, "selected_indices", NMS_SELECTEDINDICES);
-    checkOutput(getOutputShapeAtPort(NMS_SELECTEDSCORES), supportedFloatPrecision, "selected_scores", NMS_SELECTEDSCORES);
+    checkOutput(getOutputShapeAtPort(NMS_SELECTED_INDICES), "selected_indices", NMS_SELECTED_INDICES);
+    checkOutput(getOutputShapeAtPort(NMS_SELECTED_SCORES), "selected_scores", NMS_SELECTED_SCORES);
 
     std::vector<PortConfigurator> inDataConf;
-    inDataConf.reserve(inputShapes.size());
-    for (size_t i = 0; i < inputShapes.size(); ++i) {
-        Precision inPrecision = i == NMS_MAXOUTPUTBOXESPERCLASS ? Precision::I32 : Precision::FP32;
+    inDataConf.reserve(inputs_num);
+    for (size_t i = 0; i < inputs_num; ++i) {
+        Precision inPrecision = i == NMS_MAX_OUTPUT_BOXES_PER_CLASS ? Precision::I32 : Precision::FP32;
         inDataConf.emplace_back(LayoutType::ncsp, inPrecision);
     }
 
     std::vector<PortConfigurator> outDataConf;
     outDataConf.reserve(outputShapes.size());
     for (size_t i = 0; i < outputShapes.size(); ++i) {
-        Precision outPrecision = i == NMS_SELECTEDSCORES ? Precision::FP32 : Precision::I32;
+        Precision outPrecision = i == NMS_SELECTED_SCORES ? Precision::FP32 : Precision::I32;
         outDataConf.emplace_back(LayoutType::ncsp, outPrecision);
     }
 
-    impl_desc_type impl_type;
-    if (mayiuse(cpu::x64::avx512_core)) {
-        impl_type = impl_desc_type::jit_avx512;
-    } else if (mayiuse(cpu::x64::avx2)) {
-        impl_type = impl_desc_type::jit_avx2;
-    } else if (mayiuse(cpu::x64::sse41)) {
-        impl_type = impl_desc_type::jit_sse42;
-    } else {
-        impl_type = impl_desc_type::ref;
-    }
+    impl_desc_type impl_type = impl_desc_type::ref;
 
-    addSupportedPrimDesc(inDataConf, outDataConf, impl_type);
+#if defined(OPENVINO_ARCH_X86_64)
+    using namespace dnnl::impl::cpu;
 
-    // as only FP32 and ncsp is supported, and kernel is shape agnostic, we can create here. There is no need to recompilation.
+    // As only FP32 and ncsp are supported, and the kernel is shape agnostic, we can create it here. There is no need for recompilation.
     createJitKernel();
+
+    x64::cpu_isa_t actual_isa = x64::isa_undef;
+    if (m_jit_kernel) {
+        actual_isa = m_jit_kernel->getIsa();
+    }
+    switch (actual_isa) {
+        case x64::avx512_core: impl_type = impl_desc_type::jit_avx512; break;
+        case x64::avx2:        impl_type = impl_desc_type::jit_avx2;   break;
+        case x64::sse41:       impl_type = impl_desc_type::jit_sse42;  break;
+        default:               impl_type = impl_desc_type::ref;
+    }
+#endif // OPENVINO_ARCH_X86_64
+
+    addSupportedPrimDesc(inDataConf, outDataConf, impl_type);
 }
 
 void NonMaxSuppression::prepareParams() {
@@ -685,193 +173,170 @@ void NonMaxSuppression::prepareParams() {
     const auto& scoresDims = isDynamicNode() ? getParentEdgesAtPort(NMS_SCORES)[0]->getMemory().getStaticDims() :
                                                 getInputShapeAtPort(NMS_SCORES).getStaticDims();
 
-    numBatches = boxesDims[0];
-    numBoxes = boxesDims[1];
-    numClasses = scoresDims[1];
-    if (numBatches != scoresDims[0])
-        IE_THROW() << errorPrefix << " numBatches is different in 'boxes' and 'scores' inputs";
-    if (numBoxes != scoresDims[2])
-        IE_THROW() << errorPrefix << " numBoxes is different in 'boxes' and 'scores' inputs";
-
-    numFiltBox.resize(numBatches);
-    for (auto & i : numFiltBox)
-        i.resize(numClasses);
-}
+    m_batches_num = boxesDims[0];
+    m_boxes_num = boxesDims[1];
+    m_classes_num = scoresDims[1];
+    if (m_batches_num != scoresDims[0]) {
+        THROW_CPU_NODE_ERR("Batches number is different in 'boxes' and 'scores' inputs");
+    }
+    if (m_boxes_num != scoresDims[2]) {
+        THROW_CPU_NODE_ERR("Boxes number is different in 'boxes' and 'scores' inputs");
+    }
 
-bool NonMaxSuppression::isExecutable() const {
-    return isDynamicNode() || Node::isExecutable();
+    m_output_boxes_per_class = std::min(m_max_output_boxes_per_class, m_boxes_num);
+    const auto max_number_of_boxes = m_output_boxes_per_class * m_batches_num * m_classes_num;
+    m_filtered_boxes.resize(max_number_of_boxes);
+
+    m_num_filtered_boxes.resize(m_batches_num);
+    for (auto & i : m_num_filtered_boxes) {
+        i.resize(m_classes_num);
+    }
 }
 
 void NonMaxSuppression::createJitKernel() {
 #if defined(OPENVINO_ARCH_X86_64)
-    auto jcp = jit_nms_config_params();
-    jcp.box_encode_type = boxEncodingType;
-    jcp.is_soft_suppressed_by_iou = isSoftSuppressedByIOU;
-
-    if (mayiuse(cpu::x64::avx512_core)) {
-        nms_kernel.reset(new jit_uni_nms_kernel_f32<cpu::x64::avx512_core>(jcp));
-    } else if (mayiuse(cpu::x64::avx2)) {
-        nms_kernel.reset(new jit_uni_nms_kernel_f32<cpu::x64::avx2>(jcp));
-    } else if (mayiuse(cpu::x64::sse41)) {
-        nms_kernel.reset(new jit_uni_nms_kernel_f32<cpu::x64::sse41>(jcp));
-    }
+    if (!m_rotated_boxes) {  // the JIT kernel is created only for the non-rotated case; otherwise m_jit_kernel is not assigned here
+        auto jcp = kernel::NmsCompileParams();  // parameters the kernel is compiled with
+        jcp.box_encode_type = boxEncodingType;
+        jcp.is_soft_suppressed_by_iou = m_is_soft_suppressed_by_iou;
 
-    if (nms_kernel)
-        nms_kernel->create_ker();
-#endif
+        m_jit_kernel = kernel::JitKernel<kernel::NmsCompileParams, kernel::NmsCallArgs>::createInstance<kernel::NonMaxSuppression>(jcp);
+    }
+#endif // OPENVINO_ARCH_X86_64
 }
 
 void NonMaxSuppression::executeDynamicImpl(dnnl::stream strm) {
-    if (hasEmptyInputTensors() || (inputShapes.size() > NMS_MAXOUTPUTBOXESPERCLASS &&
-            reinterpret_cast<int *>(getParentEdgeAt(NMS_MAXOUTPUTBOXESPERCLASS)->getMemoryPtr()->getData())[0] == 0)) {
+    if (hasEmptyInputTensors() || (inputShapes.size() > NMS_MAX_OUTPUT_BOXES_PER_CLASS &&  // empty inputs, or a per-class limit of 0
+            reinterpret_cast<int *>(getParentEdgeAt(NMS_MAX_OUTPUT_BOXES_PER_CLASS)->getMemoryPtr()->getData())[0] == 0)) {
         redefineOutputMemory({{0, 3}, {0, 3}, {1}});
-        *reinterpret_cast<int *>(getChildEdgesAtPort(NMS_VALIDOUTPUTS)[0]->getMemoryPtr()->getData()) = 0;
+        *reinterpret_cast<int *>(getChildEdgesAtPort(NMS_VALID_OUTPUTS)[0]->getMemoryPtr()->getData()) = 0;  // NOTE(review): not guarded by m_defined_outputs, unlike execute() - confirm
         return;
     }
     execute(strm);
 }
 
 void NonMaxSuppression::execute(dnnl::stream strm) {
-    const float *boxes = reinterpret_cast<const float *>(getParentEdgeAt(NMS_BOXES)->getMemoryPtr()->getData());
-    const float *scores = reinterpret_cast<const float *>(getParentEdgeAt(NMS_SCORES)->getMemoryPtr()->getData());
-
-    if (inputShapes.size() > NMS_MAXOUTPUTBOXESPERCLASS) {
-        maxOutputBoxesPerClass = reinterpret_cast<int *>(getParentEdgeAt(NMS_MAXOUTPUTBOXESPERCLASS)->getMemoryPtr()->getData())[0];
+    const auto inputs_num = inputShapes.size();
+
+    size_t max_number_of_boxes = m_output_boxes_per_class * m_batches_num * m_classes_num;  // per-class limit x batches x classes
+    if (inputs_num > NMS_MAX_OUTPUT_BOXES_PER_CLASS) {
+        auto val = reinterpret_cast<int32_t *>(getParentEdgeAt(NMS_MAX_OUTPUT_BOXES_PER_CLASS)->getMemoryPtr()->getData())[0];
+        m_max_output_boxes_per_class = val <= 0l ? 0lu : static_cast<size_t>(val);  // non-positive input values are clamped to 0
+        m_output_boxes_per_class = std::min(m_max_output_boxes_per_class, m_boxes_num);
+        max_number_of_boxes = m_output_boxes_per_class * m_batches_num * m_classes_num;
+        m_filtered_boxes.resize(max_number_of_boxes);  // the limit was re-read from the input, so re-size the scratch buffer
     }
-
-    maxOutputBoxesPerClass = std::min(maxOutputBoxesPerClass, numBoxes);
-
-    if (maxOutputBoxesPerClass == 0) {
+    if (m_max_output_boxes_per_class == 0lu) {  // nothing can be selected; returns without writing any output
         return;
     }
 
-    if (inputShapes.size() > NMS_IOUTHRESHOLD)
-        iouThreshold = reinterpret_cast<float *>(getParentEdgeAt(NMS_IOUTHRESHOLD)->getMemoryPtr()->getData())[0];
-
-    if (inputShapes.size() > NMS_SCORETHRESHOLD)
-        scoreThreshold = reinterpret_cast<float *>(getParentEdgeAt(NMS_SCORETHRESHOLD)->getMemoryPtr()->getData())[0];
-
-    if (inputShapes.size() > NMS_SOFTNMSSIGMA)
-        softNMSSigma = reinterpret_cast<float *>(getParentEdgeAt(NMS_SOFTNMSSIGMA)->getMemoryPtr()->getData())[0];
-    scale = 0.0f;
-    if (softNMSSigma > 0.0) {
-        scale = -0.5f / softNMSSigma;
+    if (inputs_num > NMS_IOU_THRESHOLD) {
+        m_iou_threshold = reinterpret_cast<float *>(getParentEdgeAt(NMS_IOU_THRESHOLD)->getMemoryPtr()->getData())[0];
+    }
+    if (inputs_num > NMS_SCORE_THRESHOLD) {
+        m_score_threshold = reinterpret_cast<float *>(getParentEdgeAt(NMS_SCORE_THRESHOLD)->getMemoryPtr()->getData())[0];
     }
+    if (inputs_num > NMS_SOFT_NMS_SIGMA) {
+        m_soft_nms_sigma = reinterpret_cast<float *>(getParentEdgeAt(NMS_SOFT_NMS_SIGMA)->getMemoryPtr()->getData())[0];
+        m_scale = (m_soft_nms_sigma > 0.f) ? (-0.5f / m_soft_nms_sigma) : 0.f;  // m_scale is refreshed only when the sigma input is present
+    }
+
+    auto boxes_memory = getParentEdgeAt(NMS_BOXES)->getMemoryPtr();
+    auto scores_memory = getParentEdgeAt(NMS_SCORES)->getMemoryPtr();
 
-    auto boxesStrides = getParentEdgeAt(NMS_BOXES)->getMemory().getDescWithType<BlockedMemoryDesc>()->getStrides();
-    auto scoresStrides = getParentEdgeAt(NMS_SCORES)->getMemory().getDescWithType<BlockedMemoryDesc>()->getStrides();
+    auto boxes = reinterpret_cast<const float *>(boxes_memory->getData());
+    auto scores = reinterpret_cast<const float *>(scores_memory->getData());
 
-    const auto maxNumberOfBoxes = maxOutputBoxesPerClass * numBatches * numClasses;
-    std::vector<filteredBoxes> filtBoxes(maxNumberOfBoxes);
+    const auto& boxes_strides = boxes_memory->getDescWithType<BlockedMemoryDesc>()->getStrides();
+    const auto& scores_strides = scores_memory->getDescWithType<BlockedMemoryDesc>()->getStrides();
 
-    if (softNMSSigma == 0.0f) {
-        nmsWithoutSoftSigma(boxes, scores, boxesStrides, scoresStrides, filtBoxes);
+    if (m_rotated_boxes) {  // pick the selection routine: rotated, hard NMS (sigma == 0) or soft NMS
+        nmsRotated(boxes, scores, boxes_strides, scores_strides, m_filtered_boxes);
+    } else if (m_soft_nms_sigma == 0.f) {
+        nmsWithoutSoftSigma(boxes, scores, boxes_strides, scores_strides, m_filtered_boxes);
     } else {
-        nmsWithSoftSigma(boxes, scores, boxesStrides, scoresStrides, filtBoxes);
+        nmsWithSoftSigma(boxes, scores, boxes_strides, scores_strides, m_filtered_boxes);
     }
 
-    size_t startOffset = numFiltBox[0][0];
-    for (size_t b = 0; b < numFiltBox.size(); b++) {
-        size_t batchOffset = b*numClasses*maxOutputBoxesPerClass;
-        for (size_t c = (b == 0 ? 1 : 0); c < numFiltBox[b].size(); c++) {
-            size_t offset = batchOffset + c*maxOutputBoxesPerClass;
-            for (size_t i = 0; i < numFiltBox[b][c]; i++) {
-                filtBoxes[startOffset + i] = filtBoxes[offset + i];
+    size_t start_offset = m_num_filtered_boxes[0][0];  // (batch 0, class 0) results already sit at the front of the buffer
+    for (size_t b = 0lu; b < m_num_filtered_boxes.size(); b++) {
+        size_t batchOffset = b * m_classes_num * m_output_boxes_per_class;
+        for (size_t c = (b == 0lu ? 1lu : 0lu); c < m_num_filtered_boxes[b].size(); c++) {
+            size_t offset = batchOffset + c * m_output_boxes_per_class;
+            for (size_t i = 0lu; i < m_num_filtered_boxes[b][c]; i++) {
+                m_filtered_boxes[start_offset + i] = m_filtered_boxes[offset + i];  // compact per-(batch, class) slots into one contiguous prefix
             }
-            startOffset += numFiltBox[b][c];
+            start_offset += m_num_filtered_boxes[b][c];
         }
     }
-    filtBoxes.resize(startOffset);
 
+    auto boxes_ptr = m_filtered_boxes.data();
     // need more particular comparator to get deterministic behaviour
     // escape situation when filtred boxes with same score have different position from launch to launch
-    if (sortResultDescending) {
-        parallel_sort(filtBoxes.begin(), filtBoxes.end(),
-                      [](const filteredBoxes& l, const filteredBoxes& r) {
+    if (m_sort_result_descending) {
+        parallel_sort(boxes_ptr, boxes_ptr + start_offset,  // sort only the compacted prefix
+                      [](const FilteredBox& l, const FilteredBox& r) {
                           return (l.score > r.score) ||
-                                 (l.score ==  r.score && l.batch_index < r.batch_index) ||
-                                 (l.score ==  r.score && l.batch_index == r.batch_index && l.class_index < r.class_index) ||
-                                 (l.score ==  r.score && l.batch_index == r.batch_index && l.class_index == r.class_index && l.box_index < r.box_index);
+                                 (l.score == r.score && l.batch_index < r.batch_index) ||
+                                 (l.score == r.score && l.batch_index == r.batch_index && l.class_index < r.class_index) ||
+                                 (l.score == r.score && l.batch_index == r.batch_index && l.class_index == r.class_index && l.box_index < r.box_index);
                       });
     }
 
-    auto indicesMemPtr = getChildEdgesAtPort(NMS_SELECTEDINDICES)[0]->getMemoryPtr();
-    auto scoresMemPtr =  getChildEdgesAtPort(NMS_SELECTEDSCORES)[0]->getMemoryPtr();
-    const size_t validOutputs = std::min(filtBoxes.size(), maxNumberOfBoxes);
+    const size_t valid_outputs = std::min(start_offset, max_number_of_boxes);
 
-    if (!m_outStaticShape) {
-        VectorDims newDims{validOutputs, 3};
-        redefineOutputMemory({newDims, newDims, {1}});
-    }
+    if (m_defined_outputs[NMS_SELECTED_INDICES]) {
+        const size_t stride = 3lu;  // each output row is {batch_index, class_index, box_index}
 
-    int selectedIndicesStride = indicesMemPtr->getDescWithType<BlockedMemoryDesc>()->getStrides()[0];
+        if (!m_out_static_shape) {
+            redefineOutputMemory(NMS_SELECTED_INDICES, { valid_outputs, stride });
+        }
 
-    int *selectedIndicesPtr = reinterpret_cast<int *>(indicesMemPtr->getData());
-    float *selectedScoresPtr = reinterpret_cast<float *>(scoresMemPtr->getData());
+        auto out_ptr = reinterpret_cast<int32_t *>(getChildEdgesAtPort(NMS_SELECTED_INDICES)[0]->getMemoryPtr()->getData());
+        int32_t* boxes_ptr = &(m_filtered_boxes[0].batch_index);  // NOTE(review): shadows the outer boxes_ptr; assumes the three index fields are consecutive int32 - confirm
 
-    size_t idx = 0lu;
-    for (; idx < validOutputs; idx++) {
-        selectedIndicesPtr[0] = filtBoxes[idx].batch_index;
-        selectedIndicesPtr[1] = filtBoxes[idx].class_index;
-        selectedIndicesPtr[2] = filtBoxes[idx].box_index;
-        selectedIndicesPtr += selectedIndicesStride;
+        size_t idx = 0lu;
+        for (; idx < valid_outputs; idx++) {
+            memcpy(out_ptr, boxes_ptr, 12);  // 12 bytes = 3 x int32 indices
+            out_ptr += stride;
+            boxes_ptr += 4;  // presumably 4 x int32 == sizeof(FilteredBox) - TODO confirm against the struct definition
+        }
 
-        selectedScoresPtr[0] = static_cast<float>(filtBoxes[idx].batch_index);
-        selectedScoresPtr[1] = static_cast<float>(filtBoxes[idx].class_index);
-        selectedScoresPtr[2] = static_cast<float>(filtBoxes[idx].score);
-        selectedScoresPtr += selectedIndicesStride;
+        if (m_out_static_shape) {
+            std::fill(out_ptr, out_ptr + (max_number_of_boxes - idx) * stride, -1);  // pad the unused rows with -1
+        }
     }
 
-    if (m_outStaticShape) {
-        std::fill(selectedIndicesPtr, selectedIndicesPtr + (maxNumberOfBoxes - idx) * selectedIndicesStride, -1);
-        std::fill(selectedScoresPtr, selectedScoresPtr + (maxNumberOfBoxes - idx) * selectedIndicesStride, -1.f);
-    }
+    if (m_defined_outputs[NMS_SELECTED_SCORES]) {
+        const size_t stride = 3lu;  // each output row is {batch_index, class_index, score}
 
-    int *valid_outputs = reinterpret_cast<int *>(getChildEdgesAtPort(NMS_VALIDOUTPUTS)[0]->getMemoryPtr()->getData());
-    *valid_outputs = static_cast<int>(validOutputs);
-}
+        if (!m_out_static_shape) {
+            redefineOutputMemory(NMS_SELECTED_SCORES, { valid_outputs, stride });
+        }
 
-bool NonMaxSuppression::created() const {
-    return getType() == Type::NonMaxSuppression;
-}
+        auto out_ptr = reinterpret_cast<float *>(getChildEdgesAtPort(NMS_SELECTED_SCORES)[0]->getMemoryPtr()->getData());
 
-float NonMaxSuppression::intersectionOverUnion(const float *boxesI, const float *boxesJ) {
-    float yminI, xminI, ymaxI, xmaxI, yminJ, xminJ, ymaxJ, xmaxJ;
-    if (boxEncodingType == NMSBoxEncodeType::CENTER) {
-        //  box format: x_center, y_center, width, height
-        yminI = boxesI[1] - boxesI[3] / 2.f;
-        xminI = boxesI[0] - boxesI[2] / 2.f;
-        ymaxI = boxesI[1] + boxesI[3] / 2.f;
-        xmaxI = boxesI[0] + boxesI[2] / 2.f;
-        yminJ = boxesJ[1] - boxesJ[3] / 2.f;
-        xminJ = boxesJ[0] - boxesJ[2] / 2.f;
-        ymaxJ = boxesJ[1] + boxesJ[3] / 2.f;
-        xmaxJ = boxesJ[0] + boxesJ[2] / 2.f;
-    } else {
-        //  box format: y1, x1, y2, x2
-        yminI = (std::min)(boxesI[0], boxesI[2]);
-        xminI = (std::min)(boxesI[1], boxesI[3]);
-        ymaxI = (std::max)(boxesI[0], boxesI[2]);
-        xmaxI = (std::max)(boxesI[1], boxesI[3]);
-        yminJ = (std::min)(boxesJ[0], boxesJ[2]);
-        xminJ = (std::min)(boxesJ[1], boxesJ[3]);
-        ymaxJ = (std::max)(boxesJ[0], boxesJ[2]);
-        xmaxJ = (std::max)(boxesJ[1], boxesJ[3]);
-    }
+        size_t idx = 0lu;
+        for (; idx < valid_outputs; idx++) {
+            out_ptr[0] = static_cast<float>(m_filtered_boxes[idx].batch_index);
+            out_ptr[1] = static_cast<float>(m_filtered_boxes[idx].class_index);
+            out_ptr[2] = m_filtered_boxes[idx].score;
+            out_ptr += stride;
+        }
 
-    float areaI = (ymaxI - yminI) * (xmaxI - xminI);
-    float areaJ = (ymaxJ - yminJ) * (xmaxJ - xminJ);
-    if (areaI <= 0.f || areaJ <= 0.f)
-        return 0.f;
+        if (m_out_static_shape) {
+            std::fill(out_ptr, out_ptr + (max_number_of_boxes - idx) * stride, -1.f);  // pad the unused rows with -1
+        }
+    }
 
-    float intersection_area =
-            (std::max)((std::min)(ymaxI, ymaxJ) - (std::max)(yminI, yminJ), 0.f) *
-            (std::max)((std::min)(xmaxI, xmaxJ) - (std::max)(xminI, xminJ), 0.f);
-    return intersection_area / (areaI + areaJ - intersection_area);
+    if (m_defined_outputs[NMS_VALID_OUTPUTS]) {
+        auto out_ptr = reinterpret_cast<int32_t *>(getChildEdgesAtPort(NMS_VALID_OUTPUTS)[0]->getMemoryPtr()->getData());
+        *out_ptr = static_cast<int32_t>(valid_outputs);  // number of rows actually selected
+    }
 }
 
 void NonMaxSuppression::nmsWithSoftSigma(const float *boxes, const float *scores, const VectorDims &boxesStrides,
-                                                             const VectorDims &scoresStrides, std::vector<filteredBoxes> &filtBoxes) {
+                                                             const VectorDims &scoresStrides, std::vector<FilteredBox> &filtBoxes) {
     auto less = [](const boxInfo& l, const boxInfo& r) {
         return l.score < r.score || ((l.score == r.score) && (l.idx > r.idx));
     };
@@ -880,23 +345,23 @@ void NonMaxSuppression::nmsWithSoftSigma(const float *boxes, const float *scores
     // if is_soft_suppressed_by_iou is false, apply for all iou, including iou>iou_threshold, soft suppressed when score < score_threshold
     // if is_soft_suppressed_by_iou is true, hard suppressed by iou_threshold, then soft suppress
     auto coeff = [&](float iou) {
-        if (isSoftSuppressedByIOU && iou > iouThreshold)
+        if (m_is_soft_suppressed_by_iou && iou > m_iou_threshold)
             return 0.0f;
-        return std::exp(scale * iou * iou);
+        return std::exp(m_scale * iou * iou);
     };
 
-    parallel_for2d(numBatches, numClasses, [&](int batch_idx, int class_idx) {
-        std::vector<filteredBoxes> selectedBoxes;
+    parallel_for2d(m_batches_num, m_classes_num, [&](int batch_idx, int class_idx) {
+        std::vector<FilteredBox> selectedBoxes;
         const float *boxesPtr = boxes + batch_idx * boxesStrides[0];
         const float *scoresPtr = scores + batch_idx * scoresStrides[0] + class_idx * scoresStrides[1];
 
         std::priority_queue<boxInfo, std::vector<boxInfo>, decltype(less)> sorted_boxes(less);  // score, box_id, suppress_begin_index
-        for (int box_idx = 0; box_idx < static_cast<int>(numBoxes); box_idx++) {
-            if (scoresPtr[box_idx] > scoreThreshold)
+        for (int box_idx = 0; box_idx < static_cast<int>(m_boxes_num); box_idx++) {
+            if (scoresPtr[box_idx] > m_score_threshold)
                 sorted_boxes.emplace(boxInfo({scoresPtr[box_idx], box_idx, 0}));
         }
-        size_t sortedBoxSize = sorted_boxes.size();
-        size_t maxSeletedBoxNum = std::min(sortedBoxSize, maxOutputBoxesPerClass);
+        size_t sorted_boxes_size = sorted_boxes.size();
+        size_t maxSeletedBoxNum = std::min(sorted_boxes_size, m_output_boxes_per_class);
         selectedBoxes.reserve(maxSeletedBoxNum);
         if (maxSeletedBoxNum > 0) {
             // include first directly
@@ -904,22 +369,23 @@ void NonMaxSuppression::nmsWithSoftSigma(const float *boxes, const float *scores
             sorted_boxes.pop();
             selectedBoxes.push_back({ candidateBox.score, batch_idx, class_idx, candidateBox.idx });
             if (maxSeletedBoxNum > 1) {
-                if (nms_kernel) {
+                if (m_jit_kernel) {
+#if defined(OPENVINO_ARCH_X86_64)
                     std::vector<float> boxCoord0(maxSeletedBoxNum, 0.0f);
                     std::vector<float> boxCoord1(maxSeletedBoxNum, 0.0f);
                     std::vector<float> boxCoord2(maxSeletedBoxNum, 0.0f);
                     std::vector<float> boxCoord3(maxSeletedBoxNum, 0.0f);
 
-                    boxCoord0[0] = boxesPtr[candidateBox.idx * 4];
-                    boxCoord1[0] = boxesPtr[candidateBox.idx * 4 + 1];
-                    boxCoord2[0] = boxesPtr[candidateBox.idx * 4 + 2];
-                    boxCoord3[0] = boxesPtr[candidateBox.idx * 4 + 3];
+                    boxCoord0[0] = boxesPtr[candidateBox.idx * m_coord_num];
+                    boxCoord1[0] = boxesPtr[candidateBox.idx * m_coord_num + 1];
+                    boxCoord2[0] = boxesPtr[candidateBox.idx * m_coord_num + 2];
+                    boxCoord3[0] = boxesPtr[candidateBox.idx * m_coord_num + 3];
 
-                    auto arg = jit_nms_args();
-                    arg.iou_threshold = static_cast<float*>(&iouThreshold);
-                    arg.score_threshold = static_cast<float*>(&scoreThreshold);
-                    arg.scale = static_cast<float*>(&scale);
-                    while (selectedBoxes.size() < maxOutputBoxesPerClass && !sorted_boxes.empty()) {
+                    auto arg = kernel::NmsCallArgs();
+                    arg.iou_threshold = static_cast<float*>(&m_iou_threshold);
+                    arg.score_threshold = static_cast<float*>(&m_score_threshold);
+                    arg.scale = static_cast<float*>(&m_scale);
+                    while (selectedBoxes.size() < m_output_boxes_per_class && !sorted_boxes.empty()) {
                         boxInfo candidateBox = sorted_boxes.top();
                         float origScore = candidateBox.score;
                         sorted_boxes.pop();
@@ -931,9 +397,9 @@ void NonMaxSuppression::nmsWithSoftSigma(const float *boxes, const float *scores
                         arg.selected_boxes_coord[1] = static_cast<float*>(&boxCoord1[candidateBox.suppress_begin_index]);
                         arg.selected_boxes_coord[2] = static_cast<float*>(&boxCoord2[candidateBox.suppress_begin_index]);
                         arg.selected_boxes_coord[3] = static_cast<float*>(&boxCoord3[candidateBox.suppress_begin_index]);
-                        arg.candidate_box = static_cast<const float*>(&boxesPtr[candidateBox.idx * 4]);
+                        arg.candidate_box = static_cast<const float*>(&boxesPtr[candidateBox.idx * m_coord_num]);
                         arg.candidate_status = static_cast<int*>(&candidateStatus);
-                        (*nms_kernel)(&arg);
+                        (*m_jit_kernel)(&arg);
 
                         if (candidateStatus == NMSCandidateStatus::SUPPRESSED) {
                             continue;
@@ -941,30 +407,32 @@ void NonMaxSuppression::nmsWithSoftSigma(const float *boxes, const float *scores
                             if (candidateBox.score == origScore) {
                                 selectedBoxes.push_back({ candidateBox.score, batch_idx, class_idx, candidateBox.idx });
                                 int selectedSize = selectedBoxes.size();
-                                boxCoord0[selectedSize - 1] = boxesPtr[candidateBox.idx * 4];
-                                boxCoord1[selectedSize - 1] = boxesPtr[candidateBox.idx * 4 + 1];
-                                boxCoord2[selectedSize - 1] = boxesPtr[candidateBox.idx * 4 + 2];
-                                boxCoord3[selectedSize - 1] = boxesPtr[candidateBox.idx * 4 + 3];
+                                boxCoord0[selectedSize - 1] = boxesPtr[candidateBox.idx * m_coord_num];
+                                boxCoord1[selectedSize - 1] = boxesPtr[candidateBox.idx * m_coord_num + 1];
+                                boxCoord2[selectedSize - 1] = boxesPtr[candidateBox.idx * m_coord_num + 2];
+                                boxCoord3[selectedSize - 1] = boxesPtr[candidateBox.idx * m_coord_num + 3];
                             } else {
                                 candidateBox.suppress_begin_index = selectedBoxes.size();
                                 sorted_boxes.push(candidateBox);
                             }
                         }
                     }
+#endif // OPENVINO_ARCH_X86_64
                 } else {
-                    while (selectedBoxes.size() < maxOutputBoxesPerClass && !sorted_boxes.empty()) {
+                    while (selectedBoxes.size() < m_output_boxes_per_class && !sorted_boxes.empty()) {
                         boxInfo candidateBox = sorted_boxes.top();
                         float origScore = candidateBox.score;
                         sorted_boxes.pop();
 
                         int candidateStatus = NMSCandidateStatus::SELECTED; // 0 for suppressed, 1 for selected, 2 for updated
                         for (int selected_idx = static_cast<int>(selectedBoxes.size()) - 1; selected_idx >= candidateBox.suppress_begin_index; selected_idx--) {
-                            float iou = intersectionOverUnion(&boxesPtr[candidateBox.idx * 4], &boxesPtr[selectedBoxes[selected_idx].box_index * 4]);
+                            float iou = intersectionOverUnion(&boxesPtr[candidateBox.idx * m_coord_num],
+                                                              &boxesPtr[selectedBoxes[selected_idx].box_index * m_coord_num]);
 
                             // when is_soft_suppressed_by_iou is true, score is decayed to zero and implicitely suppressed if iou > iou_threshold.
                             candidateBox.score *= coeff(iou);
                             // soft suppressed
-                            if (candidateBox.score <= scoreThreshold) {
+                            if (candidateBox.score <= m_score_threshold) {
                                 candidateStatus = NMSCandidateStatus::SUPPRESSED;
                                 break;
                             }
@@ -984,8 +452,8 @@ void NonMaxSuppression::nmsWithSoftSigma(const float *boxes, const float *scores
                 }
             }
         }
-        numFiltBox[batch_idx][class_idx] = selectedBoxes.size();
-        size_t offset = batch_idx*numClasses*maxOutputBoxesPerClass + class_idx*maxOutputBoxesPerClass;
+        m_num_filtered_boxes[batch_idx][class_idx] = selectedBoxes.size();
+        size_t offset = batch_idx * m_classes_num * m_output_boxes_per_class + class_idx * m_output_boxes_per_class;
         for (size_t i = 0; i < selectedBoxes.size(); i++) {
             filtBoxes[offset + i] = selectedBoxes[i];
         }
@@ -993,44 +461,47 @@ void NonMaxSuppression::nmsWithSoftSigma(const float *boxes, const float *scores
 }
 
 void NonMaxSuppression::nmsWithoutSoftSigma(const float *boxes, const float *scores, const VectorDims &boxesStrides,
-                                                                const VectorDims &scoresStrides, std::vector<filteredBoxes> &filtBoxes) {
-    int max_out_box = static_cast<int>(maxOutputBoxesPerClass);
-    parallel_for2d(numBatches, numClasses, [&](int batch_idx, int class_idx) {
+                                                                const VectorDims &scoresStrides, std::vector<FilteredBox> &filtBoxes) {
+    int max_out_box = static_cast<int>(m_output_boxes_per_class);
+    parallel_for2d(m_batches_num, m_classes_num, [&](int batch_idx, int class_idx) {
         const float *boxesPtr = boxes + batch_idx * boxesStrides[0];
         const float *scoresPtr = scores + batch_idx * scoresStrides[0] + class_idx * scoresStrides[1];
 
         std::vector<std::pair<float, int>> sorted_boxes;  // score, box_idx
-        for (size_t box_idx = 0; box_idx < numBoxes; box_idx++) {
-            if (scoresPtr[box_idx] > scoreThreshold)
+        sorted_boxes.reserve(m_boxes_num);
+        for (size_t box_idx = 0; box_idx < m_boxes_num; box_idx++) {
+            if (scoresPtr[box_idx] > m_score_threshold) {
                 sorted_boxes.emplace_back(std::make_pair(scoresPtr[box_idx], box_idx));
+            }
         }
 
         int io_selection_size = 0;
-        size_t sortedBoxSize = sorted_boxes.size();
-        if (sortedBoxSize > 0) {
+        const size_t sortedBoxSize = sorted_boxes.size();
+        if (sortedBoxSize > 0lu) {
             parallel_sort(sorted_boxes.begin(), sorted_boxes.end(),
                           [](const std::pair<float, int>& l, const std::pair<float, int>& r) {
                               return (l.first > r.first || ((l.first == r.first) && (l.second < r.second)));
                           });
-            int offset = batch_idx*numClasses*maxOutputBoxesPerClass + class_idx*maxOutputBoxesPerClass;
-            filtBoxes[offset + 0] = filteredBoxes(sorted_boxes[0].first, batch_idx, class_idx, sorted_boxes[0].second);
+            int offset = batch_idx * m_classes_num * m_output_boxes_per_class + class_idx * m_output_boxes_per_class;
+            filtBoxes[offset + 0] = FilteredBox(sorted_boxes[0].first, batch_idx, class_idx, sorted_boxes[0].second);
             io_selection_size++;
-            if (sortedBoxSize > 1) {
-                if (nms_kernel) {
+            if (sortedBoxSize > 1lu) {
+                if (m_jit_kernel) {
+#if defined(OPENVINO_ARCH_X86_64)
                     std::vector<float> boxCoord0(sortedBoxSize, 0.0f);
                     std::vector<float> boxCoord1(sortedBoxSize, 0.0f);
                     std::vector<float> boxCoord2(sortedBoxSize, 0.0f);
                     std::vector<float> boxCoord3(sortedBoxSize, 0.0f);
 
-                    boxCoord0[0] = boxesPtr[sorted_boxes[0].second * 4];
-                    boxCoord1[0] = boxesPtr[sorted_boxes[0].second * 4 + 1];
-                    boxCoord2[0] = boxesPtr[sorted_boxes[0].second * 4 + 2];
-                    boxCoord3[0] = boxesPtr[sorted_boxes[0].second * 4 + 3];
+                    boxCoord0[0] = boxesPtr[sorted_boxes[0].second * m_coord_num];
+                    boxCoord1[0] = boxesPtr[sorted_boxes[0].second * m_coord_num + 1];
+                    boxCoord2[0] = boxesPtr[sorted_boxes[0].second * m_coord_num + 2];
+                    boxCoord3[0] = boxesPtr[sorted_boxes[0].second * m_coord_num + 3];
 
-                    auto arg = jit_nms_args();
-                    arg.iou_threshold = static_cast<float*>(&iouThreshold);
-                    arg.score_threshold = static_cast<float*>(&scoreThreshold);
-                    arg.scale = static_cast<float*>(&scale);
+                    auto arg = kernel::NmsCallArgs();
+                    arg.iou_threshold = static_cast<float*>(&m_iou_threshold);
+                    arg.score_threshold = static_cast<float*>(&m_score_threshold);
+                    arg.scale = static_cast<float*>(&m_scale);
                     // box start index do not change for hard supresion
                     arg.selected_boxes_coord[0] = static_cast<float*>(&boxCoord0[0]);
                     arg.selected_boxes_coord[1] = static_cast<float*>(&boxCoord1[0]);
@@ -1040,26 +511,27 @@ void NonMaxSuppression::nmsWithoutSoftSigma(const float *boxes, const float *sco
                     for (size_t candidate_idx = 1; (candidate_idx < sortedBoxSize) && (io_selection_size < max_out_box); candidate_idx++) {
                         int candidateStatus = NMSCandidateStatus::SELECTED; // 0 for suppressed, 1 for selected
                         arg.selected_boxes_num = io_selection_size;
-                        arg.candidate_box = static_cast<const float*>(&boxesPtr[sorted_boxes[candidate_idx].second * 4]);
+                        arg.candidate_box = static_cast<const float*>(&boxesPtr[sorted_boxes[candidate_idx].second * m_coord_num]);
                         arg.candidate_status = static_cast<int*>(&candidateStatus);
-                        (*nms_kernel)(&arg);
+                        (*m_jit_kernel)(&arg);
                         if (candidateStatus == NMSCandidateStatus::SELECTED) {
-                            boxCoord0[io_selection_size] = boxesPtr[sorted_boxes[candidate_idx].second * 4];
-                            boxCoord1[io_selection_size] = boxesPtr[sorted_boxes[candidate_idx].second * 4 + 1];
-                            boxCoord2[io_selection_size] = boxesPtr[sorted_boxes[candidate_idx].second * 4 + 2];
-                            boxCoord3[io_selection_size] = boxesPtr[sorted_boxes[candidate_idx].second * 4 + 3];
+                            boxCoord0[io_selection_size] = boxesPtr[sorted_boxes[candidate_idx].second * m_coord_num];
+                            boxCoord1[io_selection_size] = boxesPtr[sorted_boxes[candidate_idx].second * m_coord_num + 1];
+                            boxCoord2[io_selection_size] = boxesPtr[sorted_boxes[candidate_idx].second * m_coord_num + 2];
+                            boxCoord3[io_selection_size] = boxesPtr[sorted_boxes[candidate_idx].second * m_coord_num + 3];
                             filtBoxes[offset + io_selection_size] =
-                                filteredBoxes(sorted_boxes[candidate_idx].first, batch_idx, class_idx, sorted_boxes[candidate_idx].second);
+                                FilteredBox(sorted_boxes[candidate_idx].first, batch_idx, class_idx, sorted_boxes[candidate_idx].second);
                             io_selection_size++;
                         }
                     }
+#endif // OPENVINO_ARCH_X86_64
                 } else {
                     for (size_t candidate_idx = 1; (candidate_idx < sortedBoxSize) && (io_selection_size < max_out_box); candidate_idx++) {
                         int candidateStatus = NMSCandidateStatus::SELECTED; // 0 for suppressed, 1 for selected
                         for (int selected_idx = io_selection_size - 1; selected_idx >= 0; selected_idx--) {
-                            float iou = intersectionOverUnion(&boxesPtr[sorted_boxes[candidate_idx].second * 4],
-                                &boxesPtr[filtBoxes[offset + selected_idx].box_index * 4]);
-                            if (iou >= iouThreshold) {
+                            float iou = intersectionOverUnion(&boxesPtr[sorted_boxes[candidate_idx].second * m_coord_num],
+                                    &boxesPtr[filtBoxes[offset + selected_idx].box_index * m_coord_num]);
+                            if (iou >= m_iou_threshold) {
                                 candidateStatus = NMSCandidateStatus::SUPPRESSED;
                                 break;
                             }
@@ -1067,7 +539,7 @@ void NonMaxSuppression::nmsWithoutSoftSigma(const float *boxes, const float *sco
 
                         if (candidateStatus == NMSCandidateStatus::SELECTED) {
                             filtBoxes[offset + io_selection_size] =
-                                filteredBoxes(sorted_boxes[candidate_idx].first, batch_idx, class_idx, sorted_boxes[candidate_idx].second);
+                                FilteredBox(sorted_boxes[candidate_idx].first, batch_idx, class_idx, sorted_boxes[candidate_idx].second);
                             io_selection_size++;
                         }
                     }
@@ -1075,35 +547,372 @@ void NonMaxSuppression::nmsWithoutSoftSigma(const float *boxes, const float *sco
             }
         }
 
-        numFiltBox[batch_idx][class_idx] = io_selection_size;
+        m_num_filtered_boxes[batch_idx][class_idx] = io_selection_size;
     });
 }
 
-void NonMaxSuppression::checkPrecision(const Precision& prec, const std::vector<Precision>& precList,
-                                                           const std::string& name, const std::string& type) {
-    if (std::find(precList.begin(), precList.end(), prec) == precList.end())
-        IE_THROW() << errorPrefix << "has unsupported '" << name << "' " << type << " precision: " << prec;
+////////// Rotated boxes //////////
+
+struct RotatedBox {  // plain aggregate of five floats; not referenced by the functions visible in this hunk
+    float x_ctr, y_ctr, w, h, a;  // presumably centre x, centre y, width, height, angle -- TODO confirm against users
+};
+
+inline float dot_2d(const NonMaxSuppression::Point2D& A, const NonMaxSuppression::Point2D& B) {  // 2-D dot product of A and B
+    return A.x * B.x + A.y * B.y;
 }
 
-void NonMaxSuppression::check1DInput(const Shape& shape, const std::vector<Precision>& precList,
-                                                         const std::string& name, const size_t port) {
-    checkPrecision(getOriginalInputPrecisionAtPort(port), precList, name, inType);
+inline float cross_2d(const NonMaxSuppression::Point2D& A, const NonMaxSuppression::Point2D& B) {  // scalar (z) part of the 2-D cross product
+    return A.x * B.y - B.x * A.y;  // zero when A and B are collinear; the sign tells on which side of A the vector B lies
+}
+
+inline void getRotatedVertices(const float* box, NonMaxSuppression::Point2D (&pts)[4], bool clockwise) {  // box[0..4] -> 4 corner points in pts
+    auto theta = clockwise ? box[4] : -box[4];  // box[4] is the rotation angle; its sign is flipped when clockwise is false
+
+    auto cos_theta = std::cos(theta) * 0.5f;  // pre-scaled by 0.5 so the products below use half of box[2] / box[3]
+    auto sin_theta = std::sin(theta) * 0.5f;
+
+    // y: top --> down; x: left --> right. box[0], box[1] is the centre; box[2], box[3] are the two side lengths.
+    // Left-Down
+    pts[0].x = box[0] - sin_theta * box[3] - cos_theta * box[2];
+    pts[0].y = box[1] + cos_theta * box[3] - sin_theta * box[2];
+    // Left-Top
+    pts[1].x = box[0] + sin_theta * box[3] - cos_theta * box[2];
+    pts[1].y = box[1] - cos_theta * box[3] - sin_theta * box[2];
+    // Right-Top: pts[0] mirrored through the centre
+    pts[2].x = 2 * box[0] - pts[0].x;
+    pts[2].y = 2 * box[1] - pts[0].y;
+    // Right-Down: pts[1] mirrored through the centre
+    pts[3].x = 2 * box[0] - pts[1].x;
+    pts[3].y = 2 * box[1] - pts[1].y;
+}
+
+inline float polygonArea(const NonMaxSuppression::Point2D (&q)[24], const int64_t& m) {  // area spanned by the first m points of q
+    if (m <= 2l) {  // fewer than three vertices enclose no area
+        return 0.f;
+    }
+
+    float area = 0.f;
+    size_t mlu = static_cast<size_t>(m - 1l);  // safe: m >= 3 here, so m - 1 is positive
+    for (size_t i = 1lu; i < mlu; i++) {  // triangle fan anchored at q[0]: (q[0], q[i], q[i + 1])
+        area += std::abs(cross_2d(q[i] - q[0], q[i + 1] - q[0]));  // |cross| is twice the area of one fan triangle
+    }
+
+    return area / 2.f;  // equals the polygon area when the points are ordered around a convex polygon
+}
+
+inline size_t convexHullGraham(const NonMaxSuppression::Point2D (&p)[24],  // in: candidate points; the first num_in entries are read
+                               const size_t num_in,                        // in: number of valid entries in p (asserted >= 2)
+                               NonMaxSuppression::Point2D (&q)[24]) {      // out: hull vertices; the return value is their count
+    OPENVINO_ASSERT(num_in >= 2lu);
+
+    // Step 1:
+    // Find point with minimum y
+    // if more than 1 points have the same minimum y,
+    // pick the one with the minimum x.
+    size_t t = 0lu;
+    for (size_t i = 1lu; i < num_in; i++) {
+        if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) {
+            t = i;
+        }
+    }
+    auto& start = p[t];  // starting point
+
+    // Step 2:
+    // Subtract starting point from every point (for sorting in the next step)
+    for (size_t i = 0lu; i < num_in; i++) {
+        q[i] = p[i] - start;
+    }
+
+    // Swap the starting point to position 0
+    std::swap(q[t], q[0]);
+
+    // Step 3:
+    // Sort points 1 .. num_in-1 according to their relative cross-product values
+    // (essentially sorting according to angles).
+    // If the angles are the same, sort according to their distance to origin.
+    // The comparator evaluates squared distances itself, so the dist[] cache is
+    // filled only once, after the sort, when the points are in final order.
+    // (A pre-sort fill would be overwritten before ever being read.)
+    float dist[24];
+
+    std::sort(q + 1, q + num_in, [](const NonMaxSuppression::Point2D& A, const NonMaxSuppression::Point2D& B) -> bool {
+        float temp = cross_2d(A, B);
+        if (std::abs(temp) < 1e-6f) {
+            return dot_2d(A, A) < dot_2d(B, B);
+        } else {
+            return temp > 0.f;
+        }
+    });
+    // compute distance to origin after sort, since the points are now different.
+    for (size_t i = 0lu; i < num_in; i++) {
+        dist[i] = dot_2d(q[i], q[i]);
+    }
+
+    // Step 4:
+    // Make sure there are at least 2 points (that don't overlap with each other)
+    // in the stack
+    size_t k = 1lu;  // index of the non-overlapped second point
+    for (; k < num_in; k++) {
+        if (dist[k] > 1e-8f) {
+            break;
+        }
+    }
+    if (k == num_in) {
+        // We reach the end, which means the convex hull is just one point
+        q[0] = p[t];  // reported in the original coordinates, unlike the translated multi-point result
+        return 1lu;
+    }
+    q[1] = q[k];
+    size_t m = 2lu;  // 2 points in the stack
+    // Step 5:
+    // Finally we can start the scanning process.
+    // When a non-convex relationship between the 3 points is found
+    // (either concave shape or duplicated points),
+    // we pop the previous point from the stack
+    // until the 3-point relationship is convex again, or
+    // until the stack only contains two points
+    for (size_t i = k + 1lu; i < num_in; i++) {
+        while (m > 1lu && cross_2d(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) {
+            m--;
+        }
+        q[m++] = q[i];
+    }
+
+    return m;  // q[0 .. m-1] hold the hull, expressed relative to the starting point
+}
+
+inline size_t getIntersectionPoints(const NonMaxSuppression::Point2D (&pts1)[4],            // in: corners of rectangle 1
+                                    const NonMaxSuppression::Point2D (&pts2)[4],            // in: corners of rectangle 2
+                                    NonMaxSuppression::Point2D (&intersections)[24]) {      // out: collected points; returns their count
+    // Line vector
+    // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1]
+    NonMaxSuppression::Point2D vec1[4], vec2[4];
+    for (size_t i = 0lu; i < 4lu; i++) {
+        vec1[i] = pts1[(i + 1lu) % 4lu] - pts1[i];  // edge i runs from corner i to corner (i + 1) mod 4
+        vec2[i] = pts2[(i + 1lu) % 4lu] - pts2[i];
+    }
+
+    // Line test - test all line combos for intersection
+    size_t num = 0lu;  // number of intersections
+    for (size_t i = 0lu; i < 4lu; i++) {
+        for (size_t j = 0lu; j < 4lu; j++) {
+            // Solve for 2x2 Ax=b
+            float det = cross_2d(vec2[j], vec1[i]);
+
+            // This takes care of parallel lines
+            if (std::abs(det) <= 1e-14f) {
+                continue;
+            }
+
+            auto vec12 = pts2[j] - pts1[i];
+
+            auto t1 = cross_2d(vec2[j], vec12) / det;  // parameter along edge i of rectangle 1
+            auto t2 = cross_2d(vec1[i], vec12) / det;  // parameter along edge j of rectangle 2
+
+            if (t1 >= 0.f && t1 <= 1.f && t2 >= 0.f && t2 <= 1.f) {  // both parameters within [0, 1]: the segments really cross
+                intersections[num++] = pts1[i] + vec1[i] * t1;
+            }
+        }
+    }
+
+    // Check for vertices of rect1 inside rect2
+    {
+        const auto& AB = vec2[0];
+        const auto& DA = vec2[3];  // points from D to A, hence the negation when AD is needed below
+        auto ABdotAB = dot_2d(AB, AB);
+        auto ADdotAD = dot_2d(DA, DA);
+        for (size_t i = 0lu; i < 4lu; i++) {
+            // Assume ABCD is the rectangle, and P is the point to be judged
+            // P is inside ABCD if P's projection on AB lies within AB
+            // and P's projection on AD lies within AD
+
+            auto AP = pts1[i] - pts2[0];
 
+            auto APdotAB = dot_2d(AP, AB);
+            auto APdotAD = -dot_2d(AP, DA);
+
+            if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD)) {
+                intersections[num++] = pts1[i];
+            }
+        }
+    }
+
+    // Reverse the check - check for vertices of rect2 inside rect1
+    {
+        const auto& AB = vec1[0];
+        const auto& DA = vec1[3];
+        auto ABdotAB = dot_2d(AB, AB);
+        auto ADdotAD = dot_2d(DA, DA);
+        for (size_t i = 0lu; i < 4lu; i++) {
+            auto AP = pts2[i] - pts1[0];
+
+            auto APdotAB = dot_2d(AP, AB);
+            auto APdotAD = -dot_2d(AP, DA);
+
+            if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD)) {
+                intersections[num++] = pts2[i];
+            }
+        }
+    }
+
+    return num;  // at most 16 edge crossings + 4 + 4 contained corners = 24; duplicates are not removed here
+}
+
+inline float rotatedBoxesIntersection(const NonMaxSuppression::Point2D (&vertices_0)[4], const float* box_1, const bool clockwise) {
+    // There are up to 4 x 4 + 4 + 4 = 24 intersections (including duplicates) returned
+    NonMaxSuppression::Point2D intersect_pts[24], ordered_pts[24];
+
+    NonMaxSuppression::Point2D vertices_1[4];
+    getRotatedVertices(box_1, vertices_1, clockwise);  // box 0 arrives as ready-made corners; box 1 is converted here
+
+    auto num = getIntersectionPoints(vertices_0, vertices_1, intersect_pts);
+
+    if (num <= 2lu) {  // two points or fewer cannot span an area
+        return 0.f;
+    }
+
+    auto num_convex = convexHullGraham(intersect_pts, num, ordered_pts);  // order the points around their convex hull
+    return polygonArea(ordered_pts, num_convex);  // area of that hull is returned as the overlap area
+}
+
+inline float NonMaxSuppression::rotatedIntersectionOverUnion(const NonMaxSuppression::Point2D (&vertices_0)[4], const float area_0, const float* box_1) {
+    const auto area_1 = box_1[2] * box_1[3]; // W x H
+    if (area_1 <= 0.f) {  // a box without positive area is reported as not overlapping anything
+        return 0.f;
+    }
+
+    const auto intersection = rotatedBoxesIntersection(vertices_0, box_1, m_clockwise);
+
+    return intersection / (area_0 + area_1 - intersection);  // IoU = overlap / union; area_0 is not validated here (the visible caller passes only area_0 > 0)
+}
+
+void NonMaxSuppression::nmsRotated(const float* boxes, const float* scores, const VectorDims& boxes_strides,
+                                   const VectorDims& scores_strides, std::vector<FilteredBox>& filtered_boxes) {  // selected boxes are written into filtered_boxes
+    if (m_jit_kernel) {
+        THROW_CPU_NODE_ERR("does not have implementation of the JIT kernel for Rotated boxes.");
+    } else {
+        parallel_for2d(m_batches_num, m_classes_num, [&](int64_t batch_idx, int64_t class_idx) {  // one independent task per (batch, class)
+            const float *boxes_ptr = boxes + batch_idx * boxes_strides[0];
+            const float *scores_ptr = scores + batch_idx * scores_strides[0] + class_idx * scores_strides[1];
+
+            std::vector<std::pair<float, size_t>> sorted_indices;  // score, box_idx
+            sorted_indices.reserve(m_boxes_num);
+            for (size_t box_idx = 0lu; box_idx < m_boxes_num; box_idx++, scores_ptr++) {
+                if (*scores_ptr > m_score_threshold) {  // only boxes scoring strictly above the threshold become candidates
+                    sorted_indices.emplace_back(*scores_ptr, box_idx);
+                }
+            }
+
+            size_t io_selection_size = 0lu;  // number of boxes selected so far for this (batch, class)
+            const size_t sorted_boxes_size = sorted_indices.size();
+
+            if (sorted_boxes_size > 0lu) {
+                parallel_sort(sorted_indices.begin(), sorted_indices.end(),  // descending score; ties broken by ascending box index
+                            [](const std::pair<float, size_t>& l, const std::pair<float, size_t>& r) {
+                                return (l.first > r.first || ((l.first == r.first) && (l.second < r.second)));
+                            });
+                auto sorted_indices_ptr = sorted_indices.data();
+                auto filtered_boxes_ptr = filtered_boxes.data()  // start of this (batch, class) slice of the output
+                        + batch_idx * m_classes_num * m_output_boxes_per_class + class_idx * m_output_boxes_per_class;
+                *filtered_boxes_ptr = FilteredBox(sorted_indices[0].first, batch_idx, class_idx, sorted_indices[0].second);  // top score is always kept
+                io_selection_size++;
+                if (sorted_boxes_size > 1lu) {
+                    sorted_indices_ptr++;
+                    NMSCandidateStatus candidate_status;
+
+                    for (size_t candidate_idx = 1lu; (candidate_idx < sorted_boxes_size) && (io_selection_size < m_output_boxes_per_class);
+                            candidate_idx++, sorted_indices_ptr++) {
+                        candidate_status = NMSCandidateStatus::SELECTED;
+                        auto box_0 = boxes_ptr + (*sorted_indices_ptr).second * m_coord_num;
+                        const auto area_0 = box_0[2] * box_0[3]; // W x H
+
+                        if (area_0 > 0.f) {
+                            NonMaxSuppression::Point2D vertices_0[4];
+                            getRotatedVertices(box_0, vertices_0, m_clockwise);  // computed once per candidate, reused for every comparison
+                            // filtered_boxes_ptr addresses the most recently selected box; step back over all selected ones.
+                            for (size_t selected_idx = 0lu; selected_idx < io_selection_size; selected_idx++) {
+                                auto iou = rotatedIntersectionOverUnion(vertices_0, area_0, boxes_ptr + m_coord_num * (filtered_boxes_ptr - selected_idx)->box_index);
+                                if (iou > m_iou_threshold) {
+                                    candidate_status = NMSCandidateStatus::SUPPRESSED;
+                                    break;
+                                }
+                            }
+                        } else if (0.f > m_iou_threshold) {  // zero-area candidate: its IoU is taken as 0, which exceeds only a negative threshold
+                            candidate_status = NMSCandidateStatus::SUPPRESSED;
+                        }
+
+                        if (candidate_status == NMSCandidateStatus::SELECTED) {
+                            *(++filtered_boxes_ptr) =
+                                FilteredBox((*sorted_indices_ptr).first, batch_idx, class_idx, (*sorted_indices_ptr).second);
+                            io_selection_size++;
+                        }
+                    }
+                }
+            }
+
+            m_num_filtered_boxes[batch_idx][class_idx] = io_selection_size;  // consumed later to know how many slice entries are valid
+        });
+    }
+}
+
+/////////////// End of Rotated boxes ///////////////
+
+float NonMaxSuppression::intersectionOverUnion(const float *boxesI, const float *boxesJ) {  // IoU of two axis-aligned boxes, 4 floats each
+    float yminI, xminI, ymaxI, xmaxI, yminJ, xminJ, ymaxJ, xmaxJ;
+    if (boxEncodingType == NMSBoxEncodeType::CENTER) {
+        //  box format: x_center, y_center, width, height
+        yminI = boxesI[1] - boxesI[3] / 2.f;
+        xminI = boxesI[0] - boxesI[2] / 2.f;
+        ymaxI = boxesI[1] + boxesI[3] / 2.f;
+        xmaxI = boxesI[0] + boxesI[2] / 2.f;
+        yminJ = boxesJ[1] - boxesJ[3] / 2.f;
+        xminJ = boxesJ[0] - boxesJ[2] / 2.f;
+        ymaxJ = boxesJ[1] + boxesJ[3] / 2.f;
+        xmaxJ = boxesJ[0] + boxesJ[2] / 2.f;
+    } else {
+        //  box format: y1, x1, y2, x2 (min/max below make the corner order irrelevant)
+        yminI = (std::min)(boxesI[0], boxesI[2]);
+        xminI = (std::min)(boxesI[1], boxesI[3]);
+        ymaxI = (std::max)(boxesI[0], boxesI[2]);
+        xmaxI = (std::max)(boxesI[1], boxesI[3]);
+        yminJ = (std::min)(boxesJ[0], boxesJ[2]);
+        xminJ = (std::min)(boxesJ[1], boxesJ[3]);
+        ymaxJ = (std::max)(boxesJ[0], boxesJ[2]);
+        xmaxJ = (std::max)(boxesJ[1], boxesJ[3]);
+    }
+
+    float areaI = (ymaxI - yminI) * (xmaxI - xminI);
+    float areaJ = (ymaxJ - yminJ) * (xmaxJ - xminJ);
+    if (areaI <= 0.f || areaJ <= 0.f)  // a box without positive area yields IoU 0; also keeps the division below well-defined
+        return 0.f;
+
+    float intersection_area =
+            (std::max)((std::min)(ymaxI, ymaxJ) - (std::max)(yminI, yminJ), 0.f) *  // overlap along y, clamped at 0 when disjoint
+            (std::max)((std::min)(xmaxI, xmaxJ) - (std::max)(xminI, xminJ), 0.f);   // overlap along x, clamped at 0 when disjoint
+    return intersection_area / (areaI + areaJ - intersection_area);
+}
+
+void NonMaxSuppression::check1DInput(const Shape& shape, const std::string& name, const size_t port) {
     if (shape.getRank() != 0 && shape.getRank() != 1)
-        IE_THROW() << errorPrefix << "has unsupported '" << name << "' input rank: " << shape.getRank();
+        THROW_CPU_NODE_ERR("has unsupported '", name, "' input rank: ", shape.getRank());
     if (shape.getRank() == 1)
         if (shape.getDims()[0] != 1)
-            IE_THROW() << errorPrefix << "has unsupported '" << name << "' input 1st dimension size: " << MemoryDescUtils::dim2str(shape.getDims()[0]);
+            THROW_CPU_NODE_ERR("has unsupported '", name, "' input 1st dimension size: ", MemoryDescUtils::dim2str(shape.getDims()[0]));
 }
 
-void NonMaxSuppression::checkOutput(const Shape& shape, const std::vector<Precision>& precList,
-                                                        const std::string& name, const size_t port) {
-    checkPrecision(getOriginalOutputPrecisionAtPort(port), precList, name, outType);
-
+void NonMaxSuppression::checkOutput(const Shape& shape, const std::string& name, const size_t port) {
     if (shape.getRank() != 2)
-        IE_THROW() << errorPrefix << "has unsupported '" << name << "' output rank: " << shape.getRank();
+        THROW_CPU_NODE_ERR("has unsupported '", name, "' output rank: ", shape.getRank());
     if (shape.getDims()[1] != 3)
-        IE_THROW() << errorPrefix << "has unsupported '" << name << "' output 2nd dimension size: " << MemoryDescUtils::dim2str(shape.getDims()[1]);
+        THROW_CPU_NODE_ERR("has unsupported '", name, "' output 2nd dimension size: ", MemoryDescUtils::dim2str(shape.getDims()[1]));
+}
+
+bool NonMaxSuppression::isExecutable() const {  // override: a dynamic node is always reported as executable
+    return isDynamicNode() || Node::isExecutable();  // otherwise the base-class decision is used
+}
+
+bool NonMaxSuppression::created() const {  // true when this node's type tag is Type::NonMaxSuppression
+    return getType() == Type::NonMaxSuppression;
 }
 
 }   // namespace node
diff --git a/src/plugins/intel_cpu/src/nodes/non_max_suppression.h b/src/plugins/intel_cpu/src/nodes/non_max_suppression.h
index 2599fa3843ff06..6547737ef9952c 100644
--- a/src/plugins/intel_cpu/src/nodes/non_max_suppression.h
+++ b/src/plugins/intel_cpu/src/nodes/non_max_suppression.h
@@ -4,82 +4,43 @@
 
 #pragma once
 
-#include <ie_common.h>
-#include <node.h>
-#include <string>
-#include <memory>
-#include <vector>
+#include "node.h"
+#include "kernels/x64/non_max_suppression.hpp"
 
-#define BOX_COORD_NUM 4
-
-using namespace InferenceEngine;
 
 namespace ov {
 namespace intel_cpu {
 namespace node {
 
-enum class NMSBoxEncodeType {
-    CORNER,
-    CENTER
-};
-
 enum NMSCandidateStatus {
     SUPPRESSED = 0,
     SELECTED = 1,
     UPDATED = 2
 };
 
-struct jit_nms_config_params {
-    NMSBoxEncodeType box_encode_type;
-    bool is_soft_suppressed_by_iou;
-};
-
-struct jit_nms_args {
-    const void* selected_boxes_coord[BOX_COORD_NUM];
-    size_t selected_boxes_num;
-    const void* candidate_box;
-    const void* iou_threshold;
-    void* candidate_status;
-    // for soft suppression, score *= scale * iou * iou;
-    const void* score_threshold;
-    const void* scale;
-    void* score;
-};
-
-struct jit_uni_nms_kernel {
-    void (*ker_)(const jit_nms_args *);
-
-    void operator()(const jit_nms_args *args) {
-        assert(ker_);
-        ker_(args);
-    }
-
-    explicit jit_uni_nms_kernel(jit_nms_config_params jcp_) : ker_(nullptr), jcp(jcp_) {}
-    virtual ~jit_uni_nms_kernel() {}
-
-    virtual void create_ker() = 0;
-
-    jit_nms_config_params jcp;
-};
-
 class NonMaxSuppression : public Node {
 public:
-    NonMaxSuppression(const std::shared_ptr<ngraph::Node>& op, const GraphContext::CPtr context);
+    NonMaxSuppression(const std::shared_ptr<ov::Node>& op, const GraphContext::CPtr& context);
 
     void getSupportedDescriptors() override {};
+
     void initSupportedPrimitiveDescriptors() override;
+
     void execute(dnnl::stream strm) override;
+
+    void executeDynamicImpl(dnnl::stream strm) override;
+
     bool created() const override;
 
-    static bool isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept;
+    static bool isSupportedOperation(const std::shared_ptr<const ov::Node>& op, std::string& errorMessage) noexcept;
 
-    struct filteredBoxes {
+    struct FilteredBox {
         float score;
         int batch_index;
         int class_index;
         int box_index;
-        filteredBoxes() = default;
-        filteredBoxes(float _score, int _batch_index, int _class_index, int _box_index) :
+        FilteredBox() = default;
+        FilteredBox(float _score, int _batch_index, int _class_index, int _box_index) :
                 score(_score), batch_index(_batch_index), class_index(_class_index), box_index(_box_index) {}
     };
 
@@ -89,66 +50,101 @@ class NonMaxSuppression : public Node {
         int suppress_begin_index;
     };
 
-    float intersectionOverUnion(const float *boxesI, const float *boxesJ);
-
-    void nmsWithSoftSigma(const float *boxes, const float *scores, const SizeVector &boxesStrides,
-                          const SizeVector &scoresStrides, std::vector<filteredBoxes> &filtBoxes);
-
-    void nmsWithoutSoftSigma(const float *boxes, const float *scores, const SizeVector &boxesStrides,
-                             const SizeVector &scoresStrides, std::vector<filteredBoxes> &filtBoxes);
-
-    void executeDynamicImpl(dnnl::stream strm) override;
-
     bool isExecutable() const override;
+
     bool needShapeInfer() const override { return false; }
+
     void prepareParams() override;
 
+    struct Point2D {  // 2-D point / vector with the few arithmetic operators the rotated-box geometry needs
+        float x, y;
+        Point2D(const float px = 0.f, const float py = 0.f) : x(px), y(py) {}  // defaults to the origin
+        Point2D operator+(const Point2D& p) const {  // component-wise sum
+            return Point2D(x + p.x, y + p.y);
+        }
+        Point2D& operator+=(const Point2D& p) {  // in-place component-wise sum
+            x += p.x;
+            y += p.y;
+            return *this;
+        }
+        Point2D operator-(const Point2D& p) const {  // component-wise difference
+            return Point2D(x - p.x, y - p.y);
+        }
+        Point2D operator*(const float coeff) const {  // scale both components by coeff
+            return Point2D(x * coeff, y * coeff);
+        }
+    };
+
 private:
     // input
     enum {
         NMS_BOXES,
         NMS_SCORES,
-        NMS_MAXOUTPUTBOXESPERCLASS,
-        NMS_IOUTHRESHOLD,
-        NMS_SCORETHRESHOLD,
-        NMS_SOFTNMSSIGMA,
+        NMS_MAX_OUTPUT_BOXES_PER_CLASS,
+        NMS_IOU_THRESHOLD,
+        NMS_SCORE_THRESHOLD,
+        NMS_SOFT_NMS_SIGMA,
     };
 
     // output
     enum {
-        NMS_SELECTEDINDICES,
-        NMS_SELECTEDSCORES,
-        NMS_VALIDOUTPUTS
+        NMS_SELECTED_INDICES,
+        NMS_SELECTED_SCORES,
+        NMS_VALID_OUTPUTS
     };
 
-    NMSBoxEncodeType boxEncodingType = NMSBoxEncodeType::CORNER;
-    bool sortResultDescending = true;
+    float intersectionOverUnion(const float *boxesI, const float *boxesJ);
 
-    size_t numBatches = 0;
-    size_t numBoxes = 0;
-    size_t numClasses = 0;
+    float rotatedIntersectionOverUnion(const Point2D (&vertices_0)[4], const float area_0, const float* box_1);
 
-    size_t maxOutputBoxesPerClass = 0lu;
-    float iouThreshold = 0.0f;
-    float scoreThreshold = 0.0f;
-    float softNMSSigma = 0.0f;
-    float scale = 1.f;
-    // control placeholder for NMS in new opset.
-    bool isSoftSuppressedByIOU = false;
+    void nmsWithSoftSigma(const float *boxes, const float *scores, const InferenceEngine::SizeVector &boxesStrides,
+                const InferenceEngine::SizeVector &scoresStrides, std::vector<FilteredBox> &filtBoxes);
 
-    bool m_outStaticShape = false;
+    void nmsWithoutSoftSigma(const float *boxes, const float *scores, const InferenceEngine::SizeVector &boxesStrides,
+                const InferenceEngine::SizeVector &scoresStrides, std::vector<FilteredBox> &filtBoxes);
 
-    std::string errorPrefix;
+    void nmsRotated(const float *boxes, const float *scores, const InferenceEngine::SizeVector &boxesStrides,
+                const InferenceEngine::SizeVector &scoresStrides, std::vector<FilteredBox> &filtBoxes);
 
-    std::vector<std::vector<size_t>> numFiltBox;
-    const std::string inType = "input", outType = "output";
+    void check1DInput(const Shape& shape,
+                      const std::string& name,
+                      const size_t port);
 
-    void checkPrecision(const Precision& prec, const std::vector<Precision>& precList, const std::string& name, const std::string& type);
-    void check1DInput(const Shape& shape, const std::vector<Precision>& precList, const std::string& name, const size_t port);
-    void checkOutput(const Shape& shape, const std::vector<Precision>& precList, const std::string& name, const size_t port);
+    void checkOutput(const Shape& shape,
+                     const std::string& name,
+                     const size_t port);
 
     void createJitKernel();
-    std::shared_ptr<jit_uni_nms_kernel> nms_kernel = nullptr;
+
+
+    NMSBoxEncodeType boxEncodingType = NMSBoxEncodeType::CORNER;
+    bool m_sort_result_descending = true;
+    bool m_clockwise = false;
+    bool m_rotated_boxes = false;
+    size_t m_coord_num = 1lu;
+
+    size_t m_batches_num = 0lu;
+    size_t m_boxes_num   = 0lu;
+    size_t m_classes_num = 0lu;
+
+    size_t m_max_output_boxes_per_class = 0lu; // Original value of input NMS_MAX_OUTPUT_BOXES_PER_CLASS
+    size_t m_output_boxes_per_class     = 0lu; // Actual number of output boxes
+    float m_iou_threshold   = 0.f;
+    float m_score_threshold = 0.f;
+    float m_soft_nms_sigma  = 0.f;
+    float m_scale = 0.f;
+    // control placeholder for NMS in new opset.
+    bool m_is_soft_suppressed_by_iou = false;
+
+    bool m_out_static_shape = false;
+
+    std::vector<std::vector<size_t>> m_num_filtered_boxes;
+    const std::string inType = "input";
+    const std::string outType = "output";
+    bool m_defined_outputs[NMS_VALID_OUTPUTS + 1] = { false, false, false };
+    std::vector<FilteredBox> m_filtered_boxes;
+
+    std::shared_ptr<kernel::JitKernelBase> m_jit_kernel;
 };
 
 }   // namespace node
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp
index 21483175aed169..274d23ce23b527 100644
--- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp
@@ -197,6 +197,8 @@ std::vector<std::string> disabledTestPatterns() {
         R"(.*RDFTLayerTest.*SignalSize=().*)",
         // Issue: 123815 (Tests are sensintive to available thread count on testing machines)
         R"(.*smoke_Snippets_MHA_.?D_SplitDimensionM.*)",
+        // Issue: 122356
+        R"(.*NmsRotatedOpTest.*(SortDesc=True|Clockwise=False).*)",
     };
 
 #if defined(OPENVINO_ARCH_X86)
diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/common/nms_rotated.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/common/nms_rotated.cpp
new file mode 100644
index 00000000000000..7888a88a60221d
--- /dev/null
+++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/common/nms_rotated.cpp
@@ -0,0 +1,95 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "single_op_tests/nms_rotated.hpp"
+
+using namespace LayerTestsDefinitions;
+using namespace ov::test;
+
+
+static const std::vector<std::vector<InputShape>> input_shapes = {  // shape sets used by the smoke_ instantiation
+    {
+        { {}, {{1, 5, 5}} },   // presumably boxes: batch, boxes, 5 values per rotated box -- TODO confirm
+        { {}, {{1, 7, 5}} }    // presumably scores: batch, classes, boxes -- TODO confirm
+    },
+    {
+        { {}, {{2, 9, 5}} },
+        { {}, {{2, 15, 9}} }
+    },
+    {
+        { {}, {{5, 17, 5}} },
+        { {}, {{5, 7, 17}} }
+    },
+    {
+        { {}, {{9, 75, 5}} },
+        { {}, {{9, 55, 75}} }
+    },
+    {
+        { {-1, -1,  5}, {{5, 20, 5},  {3, 50,  5},  {2, 99,  5}} },    // -1 entries: presumably dynamic dims, with three concrete shapes
+        { {-1, -1, -1}, {{5, 30, 20}, {3, 100, 50}, {2, 133, 99}} }
+    }
+};
+
+static const std::vector<std::vector<InputShape>> input_shapes_nightly = {  // shape sets used by the nightly_ instantiation
+    {
+        { {}, {{3, 11, 5}} },
+        { {}, {{3, 15, 11}} }
+    },
+    {
+        { {}, {{15, 29, 5}} },
+        { {}, {{15, 31, 29}} }
+    },
+    {
+        { {}, {{21, 64, 5}} },
+        { {}, {{21, 32, 64}} }
+    },
+    {
+        { {-1, -1,  5}, {{7, 35, 5},  {7, 35,  5},  {7, 35,  5}} },    // first input keeps one concrete shape across all three runs
+        { {-1, -1, -1}, {{7, 30, 35}, {7, 100, 35}, {7, 133, 35}} }    // only the middle dim of the second input varies
+    }
+};
+
+const ov::AnyMap empty_plugin_config{};  // passed as the "additional plugin configuration" test parameter: no extra options
+
+INSTANTIATE_TEST_SUITE_P(smoke_, NmsRotatedOpTest,             // f32 / i32 precisions, all inputs non-constant
+        ::testing::Combine(
+                ::testing::ValuesIn(input_shapes),          // Input shapes
+                ::testing::Values(ElementType::f32),        // Boxes and scores input precisions
+                ::testing::Values(ElementType::i32),        // Max output boxes input precisions
+                ::testing::Values(ElementType::f32),        // Thresholds precisions
+                ::testing::Values(ElementType::i32),        // Output type
+                ::testing::Values(5, 20),                   // Max output boxes per class
+                ::testing::Values(0.3f, 0.7f),              // IOU threshold
+                ::testing::Values(0.3f, 0.7f),              // Score threshold
+                ::testing::Values(true, false),             // Sort result descending
+                ::testing::Values(true, false),             // Clockwise
+                ::testing::Values(false),                   // Is 1st input constant
+                ::testing::Values(false),                   // Is 2nd input constant
+                ::testing::Values(false),                   // Is 3rd input constant
+                ::testing::Values(false),                   // Is 4th input constant
+                ::testing::Values(false),                   // Is 5th input constant
+                ::testing::Values(empty_plugin_config),     // Additional plugin configuration
+                ::testing::Values(utils::DEVICE_CPU)),      // Device name
+        NmsRotatedOpTest::getTestCaseName);
+
+INSTANTIATE_TEST_SUITE_P(nightly_, NmsRotatedOpTest,                        // reduced precisions, constant / non-constant input mix
+        ::testing::Combine(
+                ::testing::ValuesIn(input_shapes_nightly),                   // Input shapes
+                ::testing::Values(ElementType::f16, ElementType::bf16),      // Boxes and scores input precisions
+                ::testing::Values(ElementType::i64),                         // Max output boxes input precisions
+                ::testing::Values(ElementType::f16, ElementType::bf16),      // Thresholds precisions
+                ::testing::Values(ElementType::i64),                         // Output type
+                ::testing::Values(10),                                       // Max output boxes per class
+                ::testing::Values(0.5f),                                     // IOU threshold
+                ::testing::Values(0.4f),                                     // Score threshold
+                ::testing::Values(true, false),                              // Sort result descending
+                ::testing::Values(true, false),                              // Clockwise
+                ::testing::Values(true, false),                              // Is 1st input constant
+                ::testing::Values(true, false),                              // Is 2nd input constant
+                ::testing::Values(true, false),                              // Is 3rd input constant
+                ::testing::Values(true, false),                              // Is 4th input constant
+                ::testing::Values(true, false),                              // Is 5th input constant
+                ::testing::Values(empty_plugin_config),                      // Additional plugin configuration
+                ::testing::Values(utils::DEVICE_CPU)),                       // Device name
+        NmsRotatedOpTest::getTestCaseName);
diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/non_max_suppression.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/non_max_suppression.cpp
index 072b481dd3c3da..a43b208ad9754f 100644
--- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/non_max_suppression.cpp
+++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/non_max_suppression.cpp
@@ -43,9 +43,9 @@ using NmsParams = std::tuple<InputShapeParams,
                              int32_t,                                            // Max output boxes per class
                              ThresholdValues,                                    // IOU, Score, Soft NMS sigma
                              ngraph::helpers::InputLayerType,                    // max_output_boxes_per_class input type
-                             ngraph::op::v9::NonMaxSuppression::BoxEncodingType, // Box encoding
+                             ov::op::v9::NonMaxSuppression::BoxEncodingType,     // Box encoding
                              bool,                                               // Sort result descending
-                             ngraph::element::Type,                              // Output type
+                             ElementType,                                        // Output type
                              std::string>;                                       // Device name
 
 class NmsLayerCPUTest : public testing::WithParamInterface<NmsParams>, virtual public SubgraphBaseTest, public CPUTestsBase {
@@ -57,9 +57,9 @@ class NmsLayerCPUTest : public testing::WithParamInterface<NmsParams>, virtual p
         ngraph::helpers::InputLayerType maxOutBoxesType;
         ThresholdValues thrValues;
         float iouThr, scoreThr, softNmsSigma;
-        op::v9::NonMaxSuppression::BoxEncodingType boxEncoding;
+        ov::op::v9::NonMaxSuppression::BoxEncodingType boxEncoding;
         bool sortResDescend;
-        element::Type outType;
+        ElementType outType;
         std::string targetDevice;
         std::tie(inShapeParams, inPrecisions, maxOutBoxesPerClass, thrValues, maxOutBoxesType, boxEncoding, sortResDescend, outType, targetDevice) = obj.param;
 
@@ -115,12 +115,12 @@ class NmsLayerCPUTest : public testing::WithParamInterface<NmsParams>, virtual p
         ThresholdValues thrValues;
         ngraph::helpers::InputLayerType maxOutBoxesType;
         float iouThr, scoreThr, softNmsSigma;
-        op::v9::NonMaxSuppression::BoxEncodingType boxEncoding;
+        ov::op::v9::NonMaxSuppression::BoxEncodingType boxEncoding;
         bool sortResDescend;
-        element::Type outType;
+        ElementType outType;
         std::tie(inShapeParams, inPrecisions, maxOutBoxesPerClass, thrValues, maxOutBoxesType, boxEncoding, sortResDescend, outType,
                  targetDevice) = this->GetParam();
-        element::Type paramsPrec, maxBoxPrec, thrPrec;
+        ElementType paramsPrec, maxBoxPrec, thrPrec;
         std::tie(paramsPrec, maxBoxPrec, thrPrec) = inPrecisions;
 
         std::tie(iouThr, scoreThr, softNmsSigma) = thrValues;
@@ -156,7 +156,7 @@ class NmsLayerCPUTest : public testing::WithParamInterface<NmsParams>, virtual p
 
         if (maxOutBoxesType == ngraph::helpers::InputLayerType::PARAMETER) {
             inputDynamicShapes.push_back(ngraph::PartialShape{1});
-            params.push_back(std::make_shared<ngraph::opset1::Parameter>(element::Type_t::i32, inputDynamicShapes.back()));
+            params.push_back(std::make_shared<ngraph::opset1::Parameter>(ElementType::i32, inputDynamicShapes.back()));
             params[1]->set_friendly_name("param_3");
             maxOutBoxesPerClassNode = params.back();
         } else {
@@ -166,7 +166,7 @@ class NmsLayerCPUTest : public testing::WithParamInterface<NmsParams>, virtual p
         auto iouThrNode = builder::makeConstant(thrPrec, ngraph::Shape{}, std::vector<float>{iouThr})->output(0);
         auto scoreThrNode = builder::makeConstant(thrPrec, ngraph::Shape{}, std::vector<float>{scoreThr})->output(0);
         auto softNmsSigmaNode = builder::makeConstant(thrPrec, ngraph::Shape{}, std::vector<float>{softNmsSigma})->output(0);
-        auto nms = std::make_shared<ngraph::op::v9::NonMaxSuppression>(params[0], params[1], maxOutBoxesPerClassNode, iouThrNode, scoreThrNode,
+        auto nms = std::make_shared<ov::op::v9::NonMaxSuppression>(params[0], params[1], maxOutBoxesPerClassNode, iouThrNode, scoreThrNode,
                                                                        softNmsSigmaNode, boxEncoding, sortResDescend, outType);
 
         function = makeNgraphFunction(paramsPrec, params, nms, "NMS");
@@ -276,7 +276,7 @@ class NmsLayerCPUTest : public testing::WithParamInterface<NmsParams>, virtual p
 
             expectedList.resize(selected_indices_size);
 
-            if (indeces_iter->get_element_type() == ov::element::i32) {
+            if (indeces_iter->get_element_type() == ElementType::i32) {
                 auto selected_indices_data = indeces_iter->data<int32_t>();
 
                 for (size_t i = 0; i < selected_indices_size; i += 3) {
@@ -296,7 +296,7 @@ class NmsLayerCPUTest : public testing::WithParamInterface<NmsParams>, virtual p
                 }
             }
 
-            if (scores_iter->get_element_type() == ov::element::f32) {
+            if (scores_iter->get_element_type() == ElementType::f32) {
                 auto selected_scores_data = scores_iter->data<float>();
                 for (size_t i = 0; i < selected_scores_size; i += 3) {
                     expectedList[i/3].score = selected_scores_data[i+2];
@@ -319,7 +319,7 @@ class NmsLayerCPUTest : public testing::WithParamInterface<NmsParams>, virtual p
             size_t selected_indices_size = indeces_iter->get_size();
             const auto selected_scores_data = scores_iter->data<float>();
 
-            if (indeces_iter->get_element_type() == ov::element::i32) {
+            if (indeces_iter->get_element_type() == ElementType::i32) {
                 const auto selected_indices_data = indeces_iter->data<int32_t>();
                 for (size_t i = 0; i < selected_indices_size; i += 3) {
                     const int32_t batchId = selected_indices_data[i+0];
@@ -415,10 +415,10 @@ const std::vector<InputShapeParams> inShapeParams = {
 const std::vector<int32_t> maxOutBoxPerClass = {5, 20};
 const std::vector<float> threshold = {0.3f, 0.7f};
 const std::vector<float> sigmaThreshold = {0.0f, 0.5f};
-const std::vector<op::v9::NonMaxSuppression::BoxEncodingType> encodType = {op::v9::NonMaxSuppression::BoxEncodingType::CENTER,
-                                                                           op::v9::NonMaxSuppression::BoxEncodingType::CORNER};
+const std::vector<ov::op::v9::NonMaxSuppression::BoxEncodingType> encodType = {ov::op::v9::NonMaxSuppression::BoxEncodingType::CENTER,
+                                                                               ov::op::v9::NonMaxSuppression::BoxEncodingType::CORNER};
 const std::vector<bool> sortResDesc = {true, false};
-const std::vector<element::Type> outType = {element::i32, element::i64};
+const std::vector<ElementType> outType = {ElementType::i32, ElementType::i64};
 const std::vector<ngraph::helpers::InputLayerType> maxBoxInputTypes = {ngraph::helpers::InputLayerType::PARAMETER, ngraph::helpers::InputLayerType::CONSTANT};
 
 const auto nmsParams = ::testing::Combine(::testing::ValuesIn(inShapeParams),
diff --git a/src/tests/functional/plugin/shared/include/single_op_tests/nms_rotated.hpp b/src/tests/functional/plugin/shared/include/single_op_tests/nms_rotated.hpp
new file mode 100644
index 00000000000000..e0b83a31866381
--- /dev/null
+++ b/src/tests/functional/plugin/shared/include/single_op_tests/nms_rotated.hpp
@@ -0,0 +1,15 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "shared_test_classes/single_op/nms_rotated.hpp"
+
+namespace LayerTestsDefinitions {
+
+TEST_P(NmsRotatedOpTest, CompareWithRefs) {
+    run();  // the whole scenario is delegated to run(), which the fixture inherits from its base class
+}
+
+}  // namespace LayerTestsDefinitions
diff --git a/src/tests/functional/shared_test_classes/include/shared_test_classes/single_op/nms_rotated.hpp b/src/tests/functional/shared_test_classes/include/shared_test_classes/single_op/nms_rotated.hpp
new file mode 100644
index 00000000000000..ec7b5a32ec3e1c
--- /dev/null
+++ b/src/tests/functional/shared_test_classes/include/shared_test_classes/single_op/nms_rotated.hpp
@@ -0,0 +1,47 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "shared_test_classes/base/ov_subgraph.hpp"
+
+namespace LayerTestsDefinitions {
+
+using NmsRotatedParams = std::tuple<
+    std::vector<ov::test::InputShape>,  // [0]  Shapes of the boxes and scores inputs
+    ov::test::ElementType,              // [1]  Precision of the boxes and scores inputs
+    ov::test::ElementType,              // [2]  Precision of the max-output-boxes input
+    ov::test::ElementType,              // [3]  Precision of the IOU and score threshold inputs
+    ov::test::ElementType,              // [4]  Output type
+    int64_t,                            // [5]  Max output boxes per class
+    float,                              // [6]  IOU threshold
+    float,                              // [7]  Score threshold
+    bool,                               // [8]  Sort result descending
+    bool,                               // [9]  Clockwise
+    bool,                               // [10] Is 1st input constant
+    bool,                               // [11] Is 2nd input constant
+    bool,                               // [12] Is 3rd input constant
+    bool,                               // [13] Is 4th input constant
+    bool,                               // [14] Is 5th input constant
+    ov::AnyMap,                         // [15] Additional configuration
+    std::string                         // [16] Device name
+>;
+
+class NmsRotatedOpTest : public testing::WithParamInterface<NmsRotatedParams>,  // parameterized fixture for the NMSRotated single-op tests
+                         public ov::test::SubgraphBaseTest {
+public:
+    static std::string getTestCaseName(const testing::TestParamInfo<NmsRotatedParams>& obj);  // test-name string built from the parameter tuple
+
+protected:
+    void SetUp() override;  // unpacks the parameter tuple, assigns the members below and builds the model
+
+    void generate_inputs(const std::vector<ov::Shape>& target_shapes) override;  // creates and fills the tensors of the non-constant inputs
+
+private:  // value-initialized so they are never indeterminate, even if read before SetUp() has assigned them
+    int64_t m_max_out_boxes_per_class = 0;  // value written to the "MaxOutputBoxesPerClass" input
+    float m_iou_threshold = 0.f;            // value written to the "IouThreshold" input
+    float m_score_threshold = 0.f;          // value written to the "ScoreThreshold" input
+};
+
+} // namespace LayerTestsDefinitions
diff --git a/src/tests/functional/shared_test_classes/src/single_op/nms_rotated.cpp b/src/tests/functional/shared_test_classes/src/single_op/nms_rotated.cpp
new file mode 100644
index 00000000000000..c6c9e210633ae2
--- /dev/null
+++ b/src/tests/functional/shared_test_classes/src/single_op/nms_rotated.cpp
@@ -0,0 +1,207 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "shared_test_classes/single_op/nms_rotated.hpp"
+#include "ov_models/builders.hpp"
+#include "common_test_utils/data_utils.hpp"
+#include "openvino/op/nms_rotated.hpp"
+
+using namespace ov::test;
+
+namespace LayerTestsDefinitions {
+
+std::string NmsRotatedOpTest::getTestCaseName(const testing::TestParamInfo<NmsRotatedParams>& obj) {
+    // Renders every element of the parameter tuple as a "<key>=<value>" fragment of the test name.
+    const auto& prm = obj.param;
+    const auto& shapes = std::get<0>(prm);
+    std::ostringstream name;
+
+    // Partial shape of each input, '_'-separated.
+    name << "IS=(";
+    for (size_t idx = 0lu; idx < shapes.size(); ++idx) {
+        name << (idx == 0lu ? "" : "_") << utils::partialShape2str({shapes[idx].first});
+    }
+    // One "{...}_" group per target-shape index, listing that index's static shape for every input.
+    name << ")_TS=";
+    const size_t target_count = shapes.front().second.size();
+    for (size_t ts = 0lu; ts < target_count; ++ts) {
+        name << "{";
+        for (size_t idx = 0lu; idx < shapes.size(); ++idx) {
+            name << (idx == 0lu ? "" : "_") << utils::vec2str(shapes[idx].second[ts]);
+        }
+        name << "}_";
+    }
+    name << "_BoxPrc="    << std::get<1>(prm);
+    name << "_MaxPrc="    << std::get<2>(prm);
+    name << "_ThrPrc="    << std::get<3>(prm);
+    name << "_OutPrc="    << std::get<4>(prm);
+    name << "_MaxBox="    << std::get<5>(prm);
+    name << "_IouThr="    << std::get<6>(prm);
+    name << "_ScoreThr="  << std::get<7>(prm);
+    name << "_SortDesc="  << utils::bool2str(std::get<8>(prm));
+    name << "_Clockwise=" << utils::bool2str(std::get<9>(prm));
+    name << "_ConstIn={"  << utils::bool2str(std::get<10>(prm)) << "," << utils::bool2str(std::get<11>(prm)) << ","
+                          << utils::bool2str(std::get<12>(prm)) << "," << utils::bool2str(std::get<13>(prm)) << ","
+                          << utils::bool2str(std::get<14>(prm)) << "}";
+
+    const auto& plugin_cfg = std::get<15>(prm);
+    if (!plugin_cfg.empty()) {
+        name << "_Config={";
+        for (const auto& entry : plugin_cfg) {
+            name << "_" << entry.first << "=";
+            entry.second.print(name);
+        }
+        name << "}";
+    }
+    name << "_Device=" << std::get<16>(prm);
+    return name.str();
+}
+
+void NmsRotatedOpTest::SetUp() {  // Unpacks the test parameters and builds a model containing a single NMSRotated op.
+    const auto& params          = this->GetParam();
+    const auto& in_shapes       = std::get<0>(params);
+    const auto& boxes_prc       = std::get<1>(params);
+    const auto& max_boxes_prc   = std::get<2>(params);
+    const auto& thresholds_prc  = std::get<3>(params);
+    const auto& out_prc         = std::get<4>(params);
+    m_max_out_boxes_per_class   = std::get<5>(params);  // stored as members: generate_inputs() reads these three later
+    m_iou_threshold             = std::get<6>(params);
+    m_score_threshold           = std::get<7>(params);
+    const auto& sort_descending = std::get<8>(params);
+    const auto& clockwise       = std::get<9>(params);
+    const auto& is_0_in_const   = std::get<10>(params);
+    const auto& is_1_in_const   = std::get<11>(params);
+    const auto& is_2_in_const   = std::get<12>(params);
+    const auto& is_3_in_const   = std::get<13>(params);
+    const auto& is_4_in_const   = std::get<14>(params);
+    configuration               = std::get<15>(params);
+    targetDevice                = std::get<16>(params);
+
+    std::vector<InputShape> actual_shapes;  // shapes of the non-constant inputs only
+    ov::ParameterVector in_params;  // Parameter nodes of the non-constant inputs
+    std::vector<std::shared_ptr<ov::Node>> inputs;  // all five op inputs in order, each a Constant or a Parameter
+    const auto in_shape_1d = InputShape{{1}, {{1}}};  // one-element shape shared by the max-boxes and threshold inputs
+    // CONST_CASE: switch label for precision P appending a makeConstant() node of shape S; H and L are cast to P's value type. NOTE(review): presumably random bounds - confirm.
+#define CONST_CASE(P, S, H, L)                                                                                             \
+    case P:                                                                                                                \
+        inputs.push_back(ngraph::builder::makeConstant(P, S, std::vector<ov::element_type_traits<P>::value_type>{}, true,  \
+                            ov::element_type_traits<P>::value_type(H), ov::element_type_traits<P>::value_type(L)));        \
+        break;
+    // CREATE_INPUT: if C, input N is a constant; else S is recorded and a Parameter named N is made (S.second.front() if S.first has rank 0, else S.first).
+#define CREATE_INPUT(C, P, S, N, H, L)                                                                                     \
+    if (C) {                                                                                                               \
+        switch (P) {                                                                                                       \
+            CONST_CASE(ElementType::f32,  S.second[0], H, L)                                                               \
+            CONST_CASE(ElementType::f16,  S.second[0], H, L)                                                               \
+            CONST_CASE(ElementType::bf16, S.second[0], H, L)                                                               \
+            CONST_CASE(ElementType::i32,  S.second[0], H, L)                                                               \
+            CONST_CASE(ElementType::i64,  S.second[0], H, L)                                                               \
+            default: OPENVINO_THROW("NmsRotated does not support precision ", P, " for the ", N, " input.");               \
+        }                                                                                                                  \
+    } else {                                                                                                               \
+        actual_shapes.push_back(S);                                                                                        \
+        if (S.first.rank() == 0) {                                                                                         \
+            in_params.push_back(std::make_shared<ov::op::v0::Parameter>(P, S.second.front()));                             \
+        } else {                                                                                                           \
+            in_params.push_back(std::make_shared<ov::op::v0::Parameter>(P, S.first));                                      \
+        }                                                                                                                  \
+        in_params.back()->set_friendly_name(N);                                                                            \
+        inputs.push_back(in_params.back());                                                                                \
+    }
+    // Scores reuse boxes_prc; both thresholds share thresholds_prc. The names given here are what generate_inputs() matches on.
+    CREATE_INPUT(is_0_in_const, boxes_prc,      in_shapes[0], "Boxes", 30, 10)
+    CREATE_INPUT(is_1_in_const, boxes_prc,      in_shapes[1], "Scores", 1, 0)
+    CREATE_INPUT(is_2_in_const, max_boxes_prc,  in_shape_1d, "MaxOutputBoxesPerClass", m_max_out_boxes_per_class, m_max_out_boxes_per_class)
+    CREATE_INPUT(is_3_in_const, thresholds_prc, in_shape_1d, "IouThreshold", m_iou_threshold, m_iou_threshold)
+    CREATE_INPUT(is_4_in_const, thresholds_prc, in_shape_1d, "ScoreThreshold", m_score_threshold, m_score_threshold)
+
+#undef CONST_CASE
+#undef CREATE_INPUT
+
+    init_input_shapes(actual_shapes);  // only the non-constant inputs were recorded above
+
+    const auto nms_op = std::make_shared<ov::op::v13::NMSRotated>(inputs[0], inputs[1], inputs[2], inputs[3], inputs[4],
+                                                                    sort_descending, out_prc, clockwise);
+    ov::ResultVector results;  // one Result node per output of the op
+    for (size_t i = 0lu; i < nms_op->get_output_size(); i++) {
+        results.push_back(std::make_shared<ov::op::v0::Result>(nms_op->output(i)));
+    }
+
+    function = std::make_shared<ov::Model>(results, in_params, "NMSRotated");
+}
+
+template<typename TD, typename TS>
+void fill_data(TD* dst, const TS* src, size_t len) {
+    // Converting copy: each of the `len` source elements is static_cast to TD and stored in dst.
+    for (const TS* const stop = src + len; src != stop; ++src, ++dst)
+        *dst = static_cast<TD>(*src);
+}
+
+void NmsRotatedOpTest::generate_inputs(const std::vector<ov::Shape>& targetInputStaticShapes) {  // Creates one tensor per model input, filled according to the input's friendly name.
+    inputs.clear();
+    const auto& func_inputs = function->inputs();
+
+    for (size_t i = 0llu; i < func_inputs.size(); ++i) {
+        const auto& func_input = func_inputs[i];
+        const auto& name = func_input.get_node()->get_friendly_name();
+        const auto& in_prc = func_input.get_element_type();
+        auto tensor = ov::Tensor(in_prc, targetInputStaticShapes[i]);
+        // FILL_DATA: switch label for precision P copying L elements from S into the tensor, converted to P's value type.
+#define FILL_DATA(P, S, L)                                                          \
+case P :                                                                            \
+fill_data(tensor.data<ov::element_type_traits<P>::value_type>(), S, L); break;
+        // GEN_DATA: switch label for precision P filling the whole tensor via utils::fill_data_random(); R, S and K are forwarded unchanged.
+#define GEN_DATA(P, R, S, K)                                                                                                               \
+case P :                                                                                                                                   \
+utils::fill_data_random(tensor.data<ov::element_type_traits<P>::value_type>(), shape_size(targetInputStaticShapes[i]), R, S, K); break;
+        // Dispatch on the friendly name given to each Parameter in SetUp(); an unsupported precision throws.
+        if (name == "Boxes") {
+            switch (in_prc) {
+                GEN_DATA(ElementType::f32, 30, 20, 1)
+                GEN_DATA(ElementType::f16, 30, 20, 1)
+                GEN_DATA(ElementType::bf16, 30, 20, 1)
+                default:
+                    OPENVINO_THROW("NmsRotated does not support precision ", in_prc, " for the Boxes input.");
+            }
+        } else if (name == "Scores") {
+            switch (in_prc) {
+                GEN_DATA(ElementType::f32, 1, 0, 100)
+                GEN_DATA(ElementType::f16, 1, 0, 100)
+                GEN_DATA(ElementType::bf16, 1, 0, 100)
+                default:
+                    OPENVINO_THROW("NmsRotated does not support precision ", in_prc, " for the Scores input.");
+            }
+        } else if (name == "MaxOutputBoxesPerClass") {
+            switch (in_prc) {
+                FILL_DATA(ElementType::i64, &m_max_out_boxes_per_class, 1)
+                FILL_DATA(ElementType::i32, &m_max_out_boxes_per_class, 1)
+                default:
+                    OPENVINO_THROW("NmsRotated does not support precision ", in_prc, " for the MaxOutputBoxesPerClass input.");
+            }
+        } else if (name == "IouThreshold") {
+            switch (in_prc) {
+                FILL_DATA(ElementType::f32,  &m_iou_threshold, 1)
+                FILL_DATA(ElementType::f16,  &m_iou_threshold, 1)
+                FILL_DATA(ElementType::bf16, &m_iou_threshold, 1)
+                default:
+                    OPENVINO_THROW("NmsRotated does not support precision ", in_prc, " for the IouThreshold input.");
+            }
+        } else if (name == "ScoreThreshold") {
+            switch (in_prc) {
+                FILL_DATA(ElementType::f32,  &m_score_threshold, 1)
+                FILL_DATA(ElementType::f16,  &m_score_threshold, 1)
+                FILL_DATA(ElementType::bf16, &m_score_threshold, 1)
+                default:
+                    OPENVINO_THROW("NmsRotated does not support precision ", in_prc, " for the ScoreThreshold input.");
+            }
+        }
+
+#undef GEN_DATA
+#undef FILL_DATA
+
+        inputs.insert({func_input.get_node_shared_ptr(), tensor});  // register the filled tensor for this input node
+    }
+}
+
+} // namespace LayerTestsDefinitions
diff --git a/src/tests/test_utils/functional_test_utils/layer_tests_summary/skip_configs/CPU/expected_failures_OP.csv b/src/tests/test_utils/functional_test_utils/layer_tests_summary/skip_configs/CPU/expected_failures_OP.csv
index fa91f28719a834..51b03e9f335714 100644
--- a/src/tests/test_utils/functional_test_utils/layer_tests_summary/skip_configs/CPU/expected_failures_OP.csv
+++ b/src/tests/test_utils/functional_test_utils/layer_tests_summary/skip_configs/CPU/expected_failures_OP.csv
@@ -1131,5 +1131,4 @@ conformance_RegionYolo/ReadIRTest.ImportExport/Op=RegionYolo.1_Type=f32_IR=Regio
 conformance_Add/ReadIRTest.ImportExport/Op=Add.1_Type=i32_IR=28f23780d4ca0d40671caf79d5cd9223ad8f6dc2fa5ade2521f3d99586eeeb7f_Device=CPU_Shape=static_Config=(),9.72615e-07
 conformance_Convolution/ReadIRTest.Inference/Op=Convolution.1_Type=f32_IR=c301804445f273eef62f41f02204711d9d6e571da28c76ab447d7d90983b0032_Device=CPU_Shape=dynamic_Config=(),0.000113281
 conformance/OpImplCheckTest.checkPluginImplementation/Function=Multinomial_opset13_Device=CPU_Config=(),1
-conformance/OpImplCheckTest.checkPluginImplementation/Function=NMSRotated_opset13_Device=CPU_Config=(),1
 conformance/OpImplCheckTest.checkPluginImplementation/Function=LSTMSequence_opset1_Device=CPU_Config=(),1

From 017eb943d5268cadc8b31c936b4fca3f43567959 Mon Sep 17 00:00:00 2001
From: Tatiana Savina <tatiana.savina@intel.com>
Date: Tue, 31 Oct 2023 14:56:33 +0100
Subject: [PATCH 7/9] [DOCS] Update PyPI links and pre-release note (#20799)

* update links and rm pre-release note

* update ov-dev
---
 docs/install_guides/pre-release-note.md  |  2 --
 docs/install_guides/pypi-openvino-dev.md | 15 ++++++---------
 docs/install_guides/pypi-openvino-rt.md  | 13 +++++--------
 3 files changed, 11 insertions(+), 19 deletions(-)
 delete mode 100644 docs/install_guides/pre-release-note.md

diff --git a/docs/install_guides/pre-release-note.md b/docs/install_guides/pre-release-note.md
deleted file mode 100644
index 678b1f20224457..00000000000000
--- a/docs/install_guides/pre-release-note.md
+++ /dev/null
@@ -1,2 +0,0 @@
-
-> **NOTE**: This version is pre-release software and has not undergone full release validation or qualification. No support is offered on pre-release software and APIs/behavior are subject to change.  It should NOT be incorporated into any production software/solution and instead should be used only for early testing and integration while awaiting a final release version of this software.
diff --git a/docs/install_guides/pypi-openvino-dev.md b/docs/install_guides/pypi-openvino-dev.md
index 08a16318b42d92..713f7917a6cf3d 100644
--- a/docs/install_guides/pypi-openvino-dev.md
+++ b/docs/install_guides/pypi-openvino-dev.md
@@ -1,9 +1,6 @@
 # OpenVINO™ Development Tools
 
-<!--- The note below is intended for master branch only for pre-release purpose. Remove it for official releases. --->
-> **NOTE**: This version is pre-release software and has not undergone full release validation or qualification. No support is offered on pre-release software and APIs/behavior are subject to change. It should NOT be incorporated into any production software/solution and instead should be used only for early testing and integration while awaiting a final release version of this software.
-
-> **NOTE**: OpenVINO™ Development Tools package has been deprecated and will be discontinued with 2024.0 release. To learn more, refer to the [OpenVINO Legacy Features and Components page](https://docs.openvino.ai/2023.1/openvino_legacy_features.html).
+> **NOTE**: The OpenVINO™ Development Tools package has been deprecated and will be discontinued with the 2024.0 release. To learn more, refer to the [OpenVINO Legacy Features and Components page](https://docs.openvino.ai/2023.2/openvino_legacy_features.html).
 
 Intel® Distribution of OpenVINO™ toolkit is an open-source toolkit for optimizing and deploying AI inference. It can be used to develop applications and solutions based on deep learning tasks, such as: emulation of human vision, automatic speech recognition, natural language processing, recommendation systems, etc. It provides high-performance and rich deployment options, from edge to cloud.
 
@@ -121,14 +118,14 @@ For example, to install and configure the components for working with TensorFlow
 
 | Component        | Console Script                                                                   | Description                                                                                                                                                                                                                                                                                                   |
 |------------------|---------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| [Legacy Model conversion API](https://docs.openvino.ai/nightly/openvino_docs_MO_DG_Deep_Learning_Model_Optimizer_DevGuide.html) | `mo` |**Model conversion API** imports, converts, and optimizes models that were trained in popular frameworks to a format usable by OpenVINO components. <br>Supported frameworks include Caffe\*, TensorFlow\*, MXNet\*, PaddlePaddle\*, and ONNX\*.                                               |                                         |
-| [Accuracy Checker](https://docs.openvino.ai/nightly/omz_tools_accuracy_checker.html) and <br> [Annotation Converter](https://docs.openvino.ai/nightly/omz_tools_accuracy_checker_annotation_converters.html) | `accuracy_check` <br> `convert_annotation` |**Accuracy Checker**  is a deep learning accuracy validation tool that allows you to collect accuracy metrics against popular datasets. The main advantages of the tool are the flexibility of configuration and a set of supported datasets, preprocessing, postprocessing, and metrics. <br> **Annotation Converter** is a utility that prepares datasets for evaluation with Accuracy Checker.                                             |
-| [Post-Training Optimization Tool](https://docs.openvino.ai/nightly/pot_introduction.html)| `pot` |**Post-Training Optimization Tool** allows you to optimize trained models with advanced capabilities, such as quantization and low-precision optimizations, without the need to retrain or fine-tune models.                                            |
-| [Model Downloader and other Open Model Zoo tools](https://docs.openvino.ai/nightly/omz_tools_downloader.html)| `omz_downloader` <br> `omz_converter` <br> `omz_quantizer` <br> `omz_info_dumper`| **Model Downloader** is a tool for getting access to the collection of high-quality and extremely fast pre-trained deep learning [public](@ref omz_models_group_public) and [Intel](@ref omz_models_group_intel)-trained models. These free pre-trained models can be used to speed up the development and production deployment process without training your own models. The tool downloads model files from online sources and, if necessary, patches them to make them more usable with model conversion API. A number of additional tools are also provided to automate the process of working with downloaded models:<br> **Model Converter** is a tool for converting Open Model Zoo models that are stored in an original deep learning framework format into the OpenVINO Intermediate Representation (IR) using model conversion API. <br> **Model Quantizer** is a tool for automatic quantization of full-precision models in the IR format into low-precision versions using the Post-Training Optimization Tool. <br> **Model Information Dumper** is a helper utility for dumping information about the models to a stable, machine-readable format.                                          |
+| [Legacy Model conversion API](https://docs.openvino.ai/2023.2/openvino_docs_MO_DG_Deep_Learning_Model_Optimizer_DevGuide.html) | `mo` |**Model conversion API** imports, converts, and optimizes models that were trained in popular frameworks to a format usable by OpenVINO components. <br>Supported frameworks include Caffe\*, TensorFlow\*, MXNet\*, PaddlePaddle\*, and ONNX\*.                                               |
+| [Accuracy Checker](https://docs.openvino.ai/2023.2/omz_tools_accuracy_checker.html) and <br> [Annotation Converter](https://docs.openvino.ai/2023.2/omz_tools_accuracy_checker_annotation_converters.html) | `accuracy_check` <br> `convert_annotation` |**Accuracy Checker**  is a deep learning accuracy validation tool that allows you to collect accuracy metrics against popular datasets. The main advantages of the tool are the flexibility of configuration and a set of supported datasets, preprocessing, postprocessing, and metrics. <br> **Annotation Converter** is a utility that prepares datasets for evaluation with Accuracy Checker.                                             |
+| [Post-Training Optimization Tool](https://docs.openvino.ai/2023.2/pot_introduction.html)| `pot` |**Post-Training Optimization Tool** allows you to optimize trained models with advanced capabilities, such as quantization and low-precision optimizations, without the need to retrain or fine-tune models.                                            |
+| [Model Downloader and other Open Model Zoo tools](https://docs.openvino.ai/2023.2/omz_tools_downloader.html)| `omz_downloader` <br> `omz_converter` <br> `omz_quantizer` <br> `omz_info_dumper`| **Model Downloader** is a tool for getting access to the collection of high-quality and extremely fast pre-trained deep learning [public](@ref omz_models_group_public) and [Intel](@ref omz_models_group_intel)-trained models. These free pre-trained models can be used to speed up the development and production deployment process without training your own models. The tool downloads model files from online sources and, if necessary, patches them to make them more usable with model conversion API. A number of additional tools are also provided to automate the process of working with downloaded models:<br> **Model Converter** is a tool for converting Open Model Zoo models that are stored in an original deep learning framework format into the OpenVINO Intermediate Representation (IR) using model conversion API. <br> **Model Quantizer** is a tool for automatic quantization of full-precision models in the IR format into low-precision versions using the Post-Training Optimization Tool. <br> **Model Information Dumper** is a helper utility for dumping information about the models to a stable, machine-readable format.                                          |
 
 ## Troubleshooting
 
-For general troubleshooting steps and issues, see [Troubleshooting Guide for OpenVINO Installation](https://docs.openvino.ai/2023.1/openvino_docs_get_started_guide_troubleshooting.html). The following sections also provide explanations to several error messages.
+For general troubleshooting steps and issues, see [Troubleshooting Guide for OpenVINO Installation](https://docs.openvino.ai/2023.2/openvino_docs_get_started_guide_troubleshooting.html). The following sections also provide explanations for several error messages.
 
 ### Errors with Installing via PIP for Users in China
 
diff --git a/docs/install_guides/pypi-openvino-rt.md b/docs/install_guides/pypi-openvino-rt.md
index c5d8dc0de156b6..8b5ec0c878916a 100644
--- a/docs/install_guides/pypi-openvino-rt.md
+++ b/docs/install_guides/pypi-openvino-rt.md
@@ -1,11 +1,8 @@
 # OpenVINO™ 
 
-<!--- The note below is intended for master branch only for pre-release purpose. Remove it for official releases. --->
-> **NOTE**: This version is pre-release software and has not undergone full release validation or qualification. No support is offered on pre-release software and APIs/behavior are subject to change. It should NOT be incorporated into any production software/solution and instead should be used only for early testing and integration while awaiting a final release version of this software.
-
 Intel® Distribution of OpenVINO™ toolkit is an open-source toolkit for optimizing and deploying AI inference. It can be used to develop applications and solutions based on deep learning tasks, such as: emulation of human vision, automatic speech recognition, natural language processing, recommendation systems, etc. It provides high-performance and rich deployment options, from edge to cloud.
 
-If you have already finished developing your models and converting them to the OpenVINO model format, you can install OpenVINO Runtime to deploy your applications on various devices. The [OpenVINO™](https://docs.openvino.ai/2023.1/openvino_docs_OV_UG_OV_Runtime_User_Guide.html) Python package includes a set of libraries for an easy inference integration with your products.
+If you have already finished developing your models and converting them to the OpenVINO model format, you can install OpenVINO Runtime to deploy your applications on various devices. The [OpenVINO™](https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_OV_Runtime_User_Guide.html) Python package includes a set of libraries for easily integrating inference into your products.
 
 ## System Requirements
 
@@ -75,13 +72,13 @@ If installation was successful, you will see the list of available devices.
 
 | Component        | Content                                                                  | Description                                                                                                                                                                                                                                                                                                   |
 |------------------|---------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| [OpenVINO Runtime](https://docs.openvino.ai/2023.1/openvino_docs_OV_UG_OV_Runtime_User_Guide.html) | `openvino package` |**OpenVINO Runtime**  is a set of C++ libraries with C and Python bindings providing a common API to deliver inference solutions on the platform of your choice. Use the OpenVINO Runtime API to read PyTorch\*, TensorFlow\*, TensorFlow Lite\*, ONNX\*, and PaddlePaddle\* models and execute them on preferred devices. OpenVINO Runtime uses a plugin architecture and includes the following plugins: [CPU](https://docs.openvino.ai/2023.1/openvino_docs_OV_UG_supported_plugins_CPU.html), [GPU](https://docs.openvino.ai/2023.1/openvino_docs_OV_UG_supported_plugins_GPU.html), [Auto Batch](https://docs.openvino.ai/2023.1/openvino_docs_OV_UG_Automatic_Batching.html), [Auto](https://docs.openvino.ai/2023.1/openvino_docs_OV_UG_supported_plugins_AUTO.html), [Hetero](https://docs.openvino.ai/2023.1/openvino_docs_OV_UG_Hetero_execution.html).       
-| [OpenVINO Model Converter (OVC)](https://docs.openvino.ai/2023.1/openvino_docs_model_processing_introduction.html#convert-a-model-in-cli-ovc) | `ovc` |**OpenVINO Model Converter**  converts models that were trained in popular frameworks to a format usable by OpenVINO components. <br>Supported frameworks include ONNX\*, TensorFlow\*, TensorFlow Lite\*, and PaddlePaddle\*.                                    |
-| [Benchmark Tool](https://docs.openvino.ai/2023.1/openvino_inference_engine_tools_benchmark_tool_README.html)| `benchmark_app` | **Benchmark Application** allows you to estimate deep learning inference performance on supported devices for synchronous and asynchronous modes.                                              |
+| [OpenVINO Runtime](https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_OV_Runtime_User_Guide.html) | `openvino package` |**OpenVINO Runtime**  is a set of C++ libraries with C and Python bindings providing a common API to deliver inference solutions on the platform of your choice. Use the OpenVINO Runtime API to read PyTorch\*, TensorFlow\*, TensorFlow Lite\*, ONNX\*, and PaddlePaddle\* models and execute them on preferred devices. OpenVINO Runtime uses a plugin architecture and includes the following plugins: [CPU](https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_CPU.html), [GPU](https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_GPU.html), [Auto Batch](https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_Automatic_Batching.html), [Auto](https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_AUTO.html), [Hetero](https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_Hetero_execution.html).       
+| [OpenVINO Model Converter (OVC)](https://docs.openvino.ai/2023.2/openvino_docs_model_processing_introduction.html#convert-a-model-in-cli-ovc) | `ovc` |**OpenVINO Model Converter**  converts models that were trained in popular frameworks to a format usable by OpenVINO components. <br>Supported frameworks include ONNX\*, TensorFlow\*, TensorFlow Lite\*, and PaddlePaddle\*.                                    |
+| [Benchmark Tool](https://docs.openvino.ai/2023.2/openvino_inference_engine_tools_benchmark_tool_README.html)| `benchmark_app` | **Benchmark Application** allows you to estimate deep learning inference performance on supported devices for synchronous and asynchronous modes.                                              |
 
 ## Troubleshooting
 
-For general troubleshooting steps and issues, see [Troubleshooting Guide for OpenVINO Installation](https://docs.openvino.ai/2023.1/openvino_docs_get_started_guide_troubleshooting.html). The following sections also provide explanations to several error messages. 
+For general troubleshooting steps and issues, see [Troubleshooting Guide for OpenVINO Installation](https://docs.openvino.ai/2023.2/openvino_docs_get_started_guide_troubleshooting.html). The following sections also provide explanations for several error messages.
 
 ### Errors with Installing via PIP for Users in China
 

From df19f18f373df096590473c48c891617a6c6990f Mon Sep 17 00:00:00 2001
From: Alina Kladieva <alina.kladieva@intel.com>
Date: Tue, 31 Oct 2023 17:39:41 +0100
Subject: [PATCH 8/9] Update refs for public CI pipelines (#20798)

---
 .ci/azure/linux_coverity.yml                          | 2 +-
 .ci/azure/windows.yml                                 | 2 +-
 .ci/azure/windows_conditional_compilation.yml         | 2 +-
 .github/workflows/linux.yml                           | 6 +++---
 .github/workflows/linux_conditional_compilation.yml   | 4 ++--
 .github/workflows/windows.yml                         | 2 +-
 .github/workflows/windows_conditional_compilation.yml | 4 ++--
 7 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/.ci/azure/linux_coverity.yml b/.ci/azure/linux_coverity.yml
index eeac80d8190425..5f9ffc278b26b4 100644
--- a/.ci/azure/linux_coverity.yml
+++ b/.ci/azure/linux_coverity.yml
@@ -4,7 +4,7 @@ resources:
     type: github
     endpoint: openvinotoolkit
     name: openvinotoolkit/openvino_contrib
-    ref: master
+    ref: releases/2023/2
 
 variables:
   - group: github
diff --git a/.ci/azure/windows.yml b/.ci/azure/windows.yml
index 144c605995e367..0f25d77a1c98da 100644
--- a/.ci/azure/windows.yml
+++ b/.ci/azure/windows.yml
@@ -32,7 +32,7 @@ resources:
     type: github
     endpoint: openvinotoolkit
     name: openvinotoolkit/openvino_contrib
-    ref: master
+    ref: releases/2023/2
 
 jobs:
 - job: Win
diff --git a/.ci/azure/windows_conditional_compilation.yml b/.ci/azure/windows_conditional_compilation.yml
index 3d2df492194950..ee886e5c8f4630 100644
--- a/.ci/azure/windows_conditional_compilation.yml
+++ b/.ci/azure/windows_conditional_compilation.yml
@@ -35,7 +35,7 @@ resources:
     type: github
     endpoint: openvinotoolkit
     name: openvinotoolkit/testdata
-    ref: master
+    ref: releases/2023/2
 
 variables:
   - group: github
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 6b4328f77dc3fe..43115466b50593 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -79,7 +79,7 @@ jobs:
           repository: 'openvinotoolkit/openvino_contrib'
           path: ${{ env.OPENVINO_CONTRIB_REPO }}
           submodules: 'true'
-          ref: 'master'
+          ref: 'releases/2023/2'
 
       #
       # Print system info
@@ -540,7 +540,7 @@ jobs:
             install_build_dependencies.sh
           sparse-checkout-cone-mode: false
           path: ${{ env.OPENVINO_REPO }}
-          ref: 'master'
+          ref: 'releases/2023/2'
 
       - name: Install git
         run: |
@@ -1443,7 +1443,7 @@ jobs:
         with:
           repository: 'openvinotoolkit/openvino_contrib'
           path: ${{ env.OPENVINO_CONTRIB_REPO }}
-          ref: 'master'
+          ref: 'releases/2023/2'
 
       #
       # Dependencies
diff --git a/.github/workflows/linux_conditional_compilation.yml b/.github/workflows/linux_conditional_compilation.yml
index 7fac5d9a1bde98..b3ee3fc5cfe72f 100644
--- a/.github/workflows/linux_conditional_compilation.yml
+++ b/.github/workflows/linux_conditional_compilation.yml
@@ -73,7 +73,7 @@ jobs:
           repository: 'openvinotoolkit/testdata'
           path: ${{ env.MODELS_PATH }}
           lfs: 'true'
-          ref: 'master'
+          ref: 'releases/2023/2'
 
       #
       # Print system info
@@ -243,7 +243,7 @@ jobs:
           repository: 'openvinotoolkit/testdata'
           path: ${{ env.MODELS_PATH }}
           lfs: 'true'
-          ref: 'master'
+          ref: 'releases/2023/2'
 
       - name: Download selective build statistics package
         uses: actions/download-artifact@v3
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index 4984826bfba7fd..ab9b503413cbe2 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -60,7 +60,7 @@ jobs:
         with:
           repository: 'openvinotoolkit/openvino_contrib'
           path: 'openvino_contrib'
-          ref: 'master'
+          ref: 'releases/2023/2'
 
       #
       # Print system info
diff --git a/.github/workflows/windows_conditional_compilation.yml b/.github/workflows/windows_conditional_compilation.yml
index 976daa18272796..99e83de57b191a 100644
--- a/.github/workflows/windows_conditional_compilation.yml
+++ b/.github/workflows/windows_conditional_compilation.yml
@@ -62,7 +62,7 @@ jobs:
           repository: 'openvinotoolkit/testdata'
           path: 'testdata'
           lfs: 'true'
-          ref: 'master'
+          ref: 'releases/2023/2'
 
       #
       # Print system info
@@ -221,7 +221,7 @@ jobs:
           repository: 'openvinotoolkit/testdata'
           path: 'testdata'
           lfs: 'true'
-          ref: 'master'
+          ref: 'releases/2023/2'
 
       - name: Download selective build statistics package
         uses: actions/download-artifact@v3

From 757b466c5df9b0814709fecba1b10deabd8ec647 Mon Sep 17 00:00:00 2001
From: Gorokhov Dmitriy <dmitry.gorokhov@intel.com>
Date: Thu, 2 Nov 2023 11:31:20 +0400
Subject: [PATCH 9/9] [CPU] Fixed port mismatch in Eltwise fusion graph
 optimization (#20807)

---
 src/plugins/intel_cpu/src/graph_optimizer.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/plugins/intel_cpu/src/graph_optimizer.cpp b/src/plugins/intel_cpu/src/graph_optimizer.cpp
index cf79a5b79f299d..fe8dae30e84405 100644
--- a/src/plugins/intel_cpu/src/graph_optimizer.cpp
+++ b/src/plugins/intel_cpu/src/graph_optimizer.cpp
@@ -2081,7 +2081,9 @@ void GraphOptimizer::FuseEltwiseAndSimple(Graph &graph) {
                     graphEdges.push_back(newEdge);
                     parent->addEdge(newEdge);
 
-                    parentNode->inputShapes.push_back(parent->getOutputShapeAtPort(inNum));
+                    if (parentNode->inputShapes.size() < static_cast<size_t>(outNum + 1))
+                        parentNode->inputShapes.resize(outNum + 1);
+                    parentNode->inputShapes[outNum] = parent->getOutputShapeAtPort(inNum);
                 }
             }