From 205c11e0a8aa8d90f911f26689611c11db0751c7 Mon Sep 17 00:00:00 2001 From: Bogdan Pereanu Date: Wed, 2 Oct 2024 17:28:33 +0300 Subject: [PATCH 1/3] [NPU] Add documentation for batching on NPU plugin (#26865) ### Details: - *Add documentation for batching on NPU plugin* - *...* ### Tickets: - *EISW-118045* --------- Co-authored-by: Karol Blaszczak Co-authored-by: Tatiana Savina --- .../npu-device.rst | 1 + .../npu-device/batching-on-npu-plugin.rst | 37 +++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device/batching-on-npu-plugin.rst diff --git a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst index d9f5e25c332984..7b135fa7ff0b14 100644 --- a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst +++ b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst @@ -11,6 +11,7 @@ NPU Device :hidden: npu-device/remote-tensor-api-npu-plugin + npu-device/batching-on-npu-plugin The Neural Processing Unit is a low-power hardware solution, introduced with the diff --git a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device/batching-on-npu-plugin.rst b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device/batching-on-npu-plugin.rst new file mode 100644 index 00000000000000..379822e327c8cd --- /dev/null +++ b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device/batching-on-npu-plugin.rst @@ -0,0 +1,37 @@ +NPU Plugin Batching +=============================== + + +.. meta:: + :description: OpenVINO™ NPU plugin supports batching + either by executing concurrent inferences or by + relying on native compiler support for batching. + +OpenVINO™ NPU plugin supports batching either by executing concurrent inferences or by relying on native compiler support for batching. + +First, the NPU plugin checks if the following conditions are met: + +* The batch size is on the first axis. +* All inputs and outputs have the same batch size. +* The model does not contain states. + +**If the conditions are met**, the NPU plugin attempts to compile and execute the original model with batch_size forced to 1. This approach is due to current compiler limitations and ongoing work to improve performance for batch_size greater than one. +If the compilation is successful, the plugin detects a difference in batch size between the original model layout (with a batch size set to N) +and the transformed/compiled layout (with a batch size set to 1). Then it executes the following steps: + +1. Internally constructs multiple command lists, one for each input. +2. Executes each command list for the proper offsets of input/output buffers. +3. Notifies the user of the completion of the inference request after all command lists have been executed. + +This concurrency-based batching mode is transparent to the application. A single inference request handles all inputs from the batch. +While performance may be lower compared to regular batching (based on native compiler support), this mode provides basic batching functionality for use either with older drivers +or when the model cannot yet be compiled with a batch size larger than one. 
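For illustration, a minimal sketch of what this looks like from the application side is shown below; the model path and batch size are placeholders, and the model is assumed to already expose a static batch size N on the first axis of all inputs and outputs:

.. code-block:: cpp

   #include <openvino/openvino.hpp>

   int main() {
       ov::Core core;

       // Placeholder model path; the model is assumed to already have a static
       // batch size N on the first axis of all inputs and outputs.
       auto model = core.read_model("model_batch_n.xml");

       // Compilation looks the same as for any other device; the plugin decides
       // internally whether to rely on native batching support or to fall back
       // to the concurrency-based mode described above.
       auto compiled_model = core.compile_model(model, "NPU");
       auto infer_request = compiled_model.create_infer_request();

       // A single inference request carries the whole batch: the input tensor
       // has shape {N, ...} and all N samples are written into it at once.
       ov::Tensor input = infer_request.get_input_tensor();
       // ... fill the batched input data ...

       infer_request.infer();
       return 0;
   }

Whichever mode the plugin selects, this application code does not change.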
+ +**If the conditions are not met**, the NPU plugin tries to compile and execute the original model with the given +batch_size to N as any other regular model. + +.. note:: + + With future performance improvements and support for compiling multiple models with a batch size larger + than one, the default order will change. NPU will try first to compile and execute the original model with the + given batch size and fall back to concurrent batching if compilation fails. From 0b21860e705322705142c148dc49d5bc64ebdb1d Mon Sep 17 00:00:00 2001 From: "Anastasiya(Asya) Pronina" Date: Wed, 2 Oct 2024 18:31:28 +0200 Subject: [PATCH 2/3] NPUW: Fix nullptr reference parameter in rearrange_to_proto (#26869) ### Details: - *Add subgraph identificator for operation name, so one operation in two subgraphs can be distinguished as two different operations* - *...* ### Tickets: - *EISW-139849* --------- Co-authored-by: Dmitry Matveev --- .../src/plugin/npuw/compiled_model.cpp | 1 + .../plugin/npuw/partitioning/partitioning.cpp | 58 ++++++++++++++----- 2 files changed, 46 insertions(+), 13 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp index 43cb5ec1aef931..563e99fcf2bad9 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp @@ -178,6 +178,7 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model, } auto process_params = [&](const ov::ParameterVector& _parameters) { for (size_t i = 0; i < _parameters.size(); i++) { + NPUW_ASSERT(_parameters[i]); LOG_VERB(_parameters[i]); for (size_t j = 0; j < orig_parameters.size(); j++) { if (_parameters[i] == orig_parameters[j]) { diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp index 22dfc6e103f719..192d975509ce5e 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp @@ -4,6 +4,8 @@ #include "partitioning.hpp" +#include + #include "../logging.hpp" #include "../util.hpp" #include "intel_npu/al/config/npuw.hpp" @@ -20,6 +22,26 @@ #include "patterns/dcoff.hpp" #include "patterns/opt.hpp" +namespace ov { +namespace npuw { +inline bool operator==(const std::reference_wrapper& lhs, const std::reference_wrapper& rhs) { + ov::npuw::Subgraph& llink = lhs.get(); + ov::npuw::Subgraph& rlink = rhs.get(); + return &llink == &rlink; +} +} // namespace npuw +} // namespace ov + +template +struct std::hash> { + std::size_t operator()(std::pair const& p) const noexcept { + ov::npuw::Subgraph& sg = p.first.get(); + std::size_t h1 = std::hash{}(&sg); + std::size_t h2 = std::hash{}(p.second); + return h1 ^ (h2 << 1); + } +}; + namespace { class FuncallEverywhere { @@ -161,6 +183,8 @@ class Partitioner { using PPtr = std::shared_ptr; using RPtr = std::shared_ptr; + using SubgParam = std::pair; + using SubgResult = std::pair; using LinkPtrTo = std::pair param_call_to_proto; - std::unordered_map result_call_to_proto; + std::unordered_map param_call_to_proto; + std::unordered_map result_call_to_proto; }; std::map all_functions; @@ -203,7 +227,10 @@ class Partitioner { void createFunction(FunctionPipeline& func_ggg); template - void rearrange_to_function_protocol(const std::vector& protocol, std::vector& call, const M& call_to_proto) { + void rearrange_to_function_protocol(ov::npuw::Subgraph::Ref func_ref, + 
const std::vector& protocol, + std::vector& call, + const M& call_to_proto) { LOG_DEBUG("Rearranging..."); LOG_BLOCK(); LOG_DEBUG("Protocol: " << protocol.size()); @@ -215,7 +242,7 @@ class Partitioner { LOG_DEBUG("Call: " << call.size()); for (auto&& c : call) { LOG_BLOCK(); - auto p_c = call_to_proto.at(c); + auto p_c = call_to_proto.at(typename M::key_type(func_ref, c)); to_proto.push_back(p_c); LOG_DEBUG(c << " (which is " << p_c << ")"); } @@ -536,7 +563,7 @@ void Partitioner::identifySubgraphs() { LOG_VERB("Processing group's output layer " << output_layer_name); LOG_BLOCK(); auto output_layer_ptr = node_id_cache.at(output_layer_name); - if (output_layer_ptr->inputs().empty()) { + if (output_layer_ptr->outputs().empty()) { OPENVINO_THROW("The group's output layer ", output_layer_name, " has NO OUTPUTS!! - Graph contracts are broken??"); @@ -1327,9 +1354,12 @@ void Partitioner::matchParameters(const std::string& func_name) { // Now walk other submodels and match parameters with the same key // (yes, including the first one) - for (auto&& call : model_group) { + for (std::size_t call_id = 0; call_id < model_group.size(); ++call_id) { LOG_DEBUG("Handle function call..."); LOG_BLOCK(); + auto call = model_group[call_id]; + auto subg_ref = func.refs[call_id]; + std::unordered_set this_model_nodes; for (auto&& node_ptr : call->get_ordered_ops()) { this_model_nodes.insert(node_ptr.get()); @@ -1348,7 +1378,7 @@ void Partitioner::matchParameters(const std::string& func_name) { LOG_DEBUG("Find orig parameter for " << node); auto& orig_param = proto_parameters.at(pkey); auto this_param = std::dynamic_pointer_cast(node); - func.param_call_to_proto[this_param] = orig_param; + func.param_call_to_proto[SubgParam(subg_ref, this_param)] = orig_param; } } } @@ -1386,14 +1416,16 @@ void Partitioner::matchResults(const std::string& func_name) { // Now walk all submodels and match parameters with the same key // (yes, including the first one) - for (auto&& call : model_group) { + for (std::size_t call_idx = 0; call_idx < model_group.size(); ++call_idx) { + auto call = model_group[call_idx]; + auto subg_ref = func.refs[call_idx]; for (auto&& node : call->get_ordered_ops()) { if (ov::op::util::is_output(node)) { auto&& port = node->input(0).get_source_output(); RKey rkey = {layer_to_prototype.at(port.get_node()->get_friendly_name()), port.get_index()}; auto& orig_result = proto_results.at(rkey); auto this_result = std::dynamic_pointer_cast(node); - func.result_call_to_proto[this_result] = orig_result; + func.result_call_to_proto[SubgResult(subg_ref, this_result)] = orig_result; } } } @@ -1517,8 +1549,8 @@ void Partitioner::matchRepeatedSubgraphs(const std::string& func_name) { funcall._gflops = this_sg._gflops; // duplicated code again! funcall._ops = this_sg._ops; // duplicated code again! funcall._avoid_list = this_sg._avoid_list; // duplicated code again! 
-            rearrange_to_function_protocol(body_params, funcall._parameters, func_ggg.param_call_to_proto);
-            rearrange_to_function_protocol(body_results, funcall._results, func_ggg.result_call_to_proto);
+            rearrange_to_function_protocol(this_sg, body_params, funcall._parameters, func_ggg.param_call_to_proto);
+            rearrange_to_function_protocol(this_sg, body_results, funcall._results, func_ggg.result_call_to_proto);
 
             auto func_iter = P.functions.find(func_name);
             NPUW_ASSERT(func_iter != P.functions.end());
@@ -1883,7 +1915,7 @@ void Partitioner::finalizeLinks() {
             auto& params = P.functions.at(sg_desc._funcall)._model->get_parameters();
             auto& proto = func_pipeline_type == FunctionPipelineType::CWAI
                               ? ptr  // no protos in the CWAI case..
-                              : all_functions.at(sg_desc._funcall).param_call_to_proto.at(ptr);
+                              : all_functions.at(sg_desc._funcall).param_call_to_proto.at(SubgParam(sg_desc, ptr));
             auto param_iter = std::find(params.begin(), params.end(), proto);
             NPUW_ASSERT(param_iter != params.end());
             return std::distance(params.begin(), param_iter);
@@ -1904,7 +1936,7 @@ void Partitioner::finalizeLinks() {
             auto& results = P.functions.at(sg_desc._funcall)._model->get_results();
             auto& proto = func_pipeline_type == FunctionPipelineType::CWAI
                               ? ptr  // no protos in the CWAI case...
-                              : all_functions.at(sg_desc._funcall).result_call_to_proto.at(ptr);
+                              : all_functions.at(sg_desc._funcall).result_call_to_proto.at(SubgResult(sg_desc, ptr));
             auto result_iter = std::find(results.begin(), results.end(), proto);
             NPUW_ASSERT(result_iter != results.end());
             return std::distance(results.begin(), result_iter);

From cddcfe83c6f77cb48443907180dea97c22f6d625 Mon Sep 17 00:00:00 2001
From: Andrei Beleiu
Date: Wed, 2 Oct 2024 20:55:21 +0300
Subject: [PATCH 3/3] single-image-test: added accuracy workaround for
 computing CPU reference and adjust the NRMSE and PSNR metrics (#26020)

### Details:
Small improvements for SIT:
- Disable the [dynamic quantization feature](https://github.com/openvinotoolkit/openvino/blob/master/src/inference/include/openvino/runtime/properties.hpp#L574), which is used by default by the CPU pipeline to generate the reference outputs and affects accuracy for some particular models
- Increase the number of decimals for the reported NRMSE metric to improve reporting precision
- For the PSNR metric, report a failure only when the accuracy drops below the target, not when it rises above it (see the sketch after the diff below)

### Tickets:
 - *[CVS-143420](https://jira.devtools.intel.com/browse/CVS-143420)*
---
 src/plugins/intel_npu/tools/single-image-test/main.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/plugins/intel_npu/tools/single-image-test/main.cpp b/src/plugins/intel_npu/tools/single-image-test/main.cpp
index 3b3009bb5f459c..4018982b022ed3 100644
--- a/src/plugins/intel_npu/tools/single-image-test/main.cpp
+++ b/src/plugins/intel_npu/tools/single-image-test/main.cpp
@@ -1200,7 +1200,8 @@ bool computeRRMSE(const ov::Tensor& output, const ov::Tensor& reference) {
 
     double rrmseLoss = sqrt(error / sum);
 
-    std::cout << "RRMSE loss : " << rrmseLoss << " RRMSE threshold : " << FLAGS_rrmse_loss_threshold << std::endl;
+    std::cout << "RRMSE loss : " << std::fixed << std::setprecision(4) << rrmseLoss
+              << " RRMSE threshold : " << FLAGS_rrmse_loss_threshold << std::endl;
     return rrmseLoss <= FLAGS_rrmse_loss_threshold;
 }
 
@@ -1267,7 +1268,8 @@ bool computeNRMSE(const ov::Tensor& output, const ov::Tensor& reference) {
     double nrmseLoss =
             sqrt(error / size) / std::max(0.001f, std::max(maxOutput - minOutput, maxReference - minReference));
 
-    std::cout << "NRMSE loss : " <<
nrmseLoss << " NRMSE threshold : " << FLAGS_nrmse_loss_threshold << std::endl; + std::cout << "NRMSE loss : " << std::fixed << std::setprecision(4) << nrmseLoss + << " NRMSE threshold : " << FLAGS_nrmse_loss_threshold << std::endl; return nrmseLoss <= FLAGS_nrmse_loss_threshold; } @@ -1319,7 +1321,7 @@ bool testPSNR(const TensorMap& outputs, const TensorMap& references, const int d auto result = utils::runPSNRMetric(actOutput, refOutput, dstHeight, dstWidth, scaleBorder, normalizedImage); - if (std::fabs(result - FLAGS_psnr_reference) > FLAGS_psnr_tolerance) { + if (FLAGS_psnr_reference - result > FLAGS_psnr_tolerance) { std::cout << "Absolute difference between actual value " << result << " and reference value " << FLAGS_psnr_reference << " larger then tolerance " << FLAGS_psnr_tolerance << std::endl; return false;
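To make the adjusted PSNR acceptance rule concrete, here is a standalone sketch (simplified; the helper name `psnrWithinTarget` is hypothetical, it only mirrors the `FLAGS_psnr_reference` / `FLAGS_psnr_tolerance` comparison above and is not part of the patch):

```cpp
#include <iostream>

// One-sided acceptance: only a PSNR drop below the reference by more than the
// tolerance is a failure; a result above the reference is always accepted.
bool psnrWithinTarget(double result, double reference, double tolerance) {
    return !(reference - result > tolerance);
}

int main() {
    std::cout << std::boolalpha
              << psnrWithinTarget(32.0, 30.0, 1.0) << "\n"   // above the reference    -> pass
              << psnrWithinTarget(29.5, 30.0, 1.0) << "\n"   // within the tolerance   -> pass
              << psnrWithinTarget(28.0, 30.0, 1.0) << "\n";  // 2.0 below, tolerance 1 -> fail
    return 0;
}
```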