From 205c11e0a8aa8d90f911f26689611c11db0751c7 Mon Sep 17 00:00:00 2001 From: Bogdan Pereanu Date: Wed, 2 Oct 2024 17:28:33 +0300 Subject: [PATCH 1/3] [NPU] Add documentation for batching on NPU plugin (#26865) ### Details: - *Add documentation for batching on NPU plugin* - *...* ### Tickets: - *EISW-118045* --------- Co-authored-by: Karol Blaszczak Co-authored-by: Tatiana Savina --- .../npu-device.rst | 1 + .../npu-device/batching-on-npu-plugin.rst | 37 +++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device/batching-on-npu-plugin.rst diff --git a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst index d9f5e25c332984..7b135fa7ff0b14 100644 --- a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst +++ b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst @@ -11,6 +11,7 @@ NPU Device :hidden: npu-device/remote-tensor-api-npu-plugin + npu-device/batching-on-npu-plugin The Neural Processing Unit is a low-power hardware solution, introduced with the diff --git a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device/batching-on-npu-plugin.rst b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device/batching-on-npu-plugin.rst new file mode 100644 index 00000000000000..379822e327c8cd --- /dev/null +++ b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device/batching-on-npu-plugin.rst @@ -0,0 +1,37 @@ +NPU Plugin Batching +=============================== + + +.. meta:: + :description: OpenVINO™ NPU plugin supports batching + either by executing concurrent inferences or by + relying on native compiler support for batching. + +OpenVINO™ NPU plugin supports batching either by executing concurrent inferences or by relying on native compiler support for batching. + +First, the NPU plugin checks if the following conditions are met: + +* The batch size is on the first axis. +* All inputs and outputs have the same batch size. +* The model does not contain states. + +**If the conditions are met**, the NPU plugin attempts to compile and execute the original model with batch_size forced to 1. This approach is due to current compiler limitations and ongoing work to improve performance for batch_size greater than one. +If the compilation is successful, the plugin detects a difference in batch size between the original model layout (with a batch size set to N) +and the transformed/compiled layout (with a batch size set to 1). Then it executes the following steps: + +1. Internally constructs multiple command lists, one for each input. +2. Executes each command list for the proper offsets of input/output buffers. +3. Notifies the user of the completion of the inference request after all command lists have been executed. + +This concurrency-based batching mode is transparent to the application. A single inference request handles all inputs from the batch. +While performance may be lower compared to regular batching (based on native compiler support), this mode provides basic batching functionality for use either with older drivers +or when the model cannot yet be compiled with a batch size larger than one. 
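For illustration, a minimal sketch of what this looks like from the application side is shown below; the model path and batch size are placeholders, and the model is assumed to already expose a static batch size N on the first axis of all inputs and outputs:

.. code-block:: cpp

   #include <openvino/openvino.hpp>

   int main() {
       ov::Core core;

       // Placeholder model path; the model is assumed to already have a static
       // batch size N on the first axis of all inputs and outputs.
       auto model = core.read_model("model_batch_n.xml");

       // Compilation looks the same as for any other device; the plugin decides
       // internally whether to rely on native batching support or to fall back
       // to the concurrency-based mode described above.
       auto compiled_model = core.compile_model(model, "NPU");
       auto infer_request = compiled_model.create_infer_request();

       // A single inference request carries the whole batch: the input tensor
       // has shape {N, ...} and all N samples are written into it at once.
       ov::Tensor input = infer_request.get_input_tensor();
       // ... fill the batched input data ...

       infer_request.infer();
       return 0;
   }

Whichever mode the plugin selects, this application code does not change.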
+ +**If the conditions are not met**, the NPU plugin tries to compile and execute the original model with the given +batch_size to N as any other regular model. + +.. note:: + + With future performance improvements and support for compiling multiple models with a batch size larger + than one, the default order will change. NPU will try first to compile and execute the original model with the + given batch size and fall back to concurrent batching if compilation fails. From 0b21860e705322705142c148dc49d5bc64ebdb1d Mon Sep 17 00:00:00 2001 From: "Anastasiya(Asya) Pronina" Date: Wed, 2 Oct 2024 18:31:28 +0200 Subject: [PATCH 2/3] NPUW: Fix nullptr reference parameter in rearrange_to_proto (#26869) ### Details: - *Add subgraph identificator for operation name, so one operation in two subgraphs can be distinguished as two different operations* - *...* ### Tickets: - *EISW-139849* --------- Co-authored-by: Dmitry Matveev --- .../src/plugin/npuw/compiled_model.cpp | 1 + .../plugin/npuw/partitioning/partitioning.cpp | 58 ++++++++++++++----- 2 files changed, 46 insertions(+), 13 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp index 43cb5ec1aef931..563e99fcf2bad9 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp @@ -178,6 +178,7 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model, } auto process_params = [&](const ov::ParameterVector& _parameters) { for (size_t i = 0; i < _parameters.size(); i++) { + NPUW_ASSERT(_parameters[i]); LOG_VERB(_parameters[i]); for (size_t j = 0; j < orig_parameters.size(); j++) { if (_parameters[i] == orig_parameters[j]) { diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp index 22dfc6e103f719..192d975509ce5e 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp @@ -4,6 +4,8 @@ #include "partitioning.hpp" +#include + #include "../logging.hpp" #include "../util.hpp" #include "intel_npu/al/config/npuw.hpp" @@ -20,6 +22,26 @@ #include "patterns/dcoff.hpp" #include "patterns/opt.hpp" +namespace ov { +namespace npuw { +inline bool operator==(const std::reference_wrapper& lhs, const std::reference_wrapper& rhs) { + ov::npuw::Subgraph& llink = lhs.get(); + ov::npuw::Subgraph& rlink = rhs.get(); + return &llink == &rlink; +} +} // namespace npuw +} // namespace ov + +template +struct std::hash> { + std::size_t operator()(std::pair const& p) const noexcept { + ov::npuw::Subgraph& sg = p.first.get(); + std::size_t h1 = std::hash{}(&sg); + std::size_t h2 = std::hash{}(p.second); + return h1 ^ (h2 << 1); + } +}; + namespace { class FuncallEverywhere { @@ -161,6 +183,8 @@ class Partitioner { using PPtr = std::shared_ptr; using RPtr = std::shared_ptr; + using SubgParam = std::pair; + using SubgResult = std::pair; using LinkPtrTo = std::pair param_call_to_proto; - std::unordered_map result_call_to_proto; + std::unordered_map param_call_to_proto; + std::unordered_map result_call_to_proto; }; std::map all_functions; @@ -203,7 +227,10 @@ class Partitioner { void createFunction(FunctionPipeline& func_ggg); template - void rearrange_to_function_protocol(const std::vector& protocol, std::vector& call, const M& call_to_proto) { + void rearrange_to_function_protocol(ov::npuw::Subgraph::Ref func_ref, + 
const std::vector& protocol, + std::vector& call, + const M& call_to_proto) { LOG_DEBUG("Rearranging..."); LOG_BLOCK(); LOG_DEBUG("Protocol: " << protocol.size()); @@ -215,7 +242,7 @@ class Partitioner { LOG_DEBUG("Call: " << call.size()); for (auto&& c : call) { LOG_BLOCK(); - auto p_c = call_to_proto.at(c); + auto p_c = call_to_proto.at(typename M::key_type(func_ref, c)); to_proto.push_back(p_c); LOG_DEBUG(c << " (which is " << p_c << ")"); } @@ -536,7 +563,7 @@ void Partitioner::identifySubgraphs() { LOG_VERB("Processing group's output layer " << output_layer_name); LOG_BLOCK(); auto output_layer_ptr = node_id_cache.at(output_layer_name); - if (output_layer_ptr->inputs().empty()) { + if (output_layer_ptr->outputs().empty()) { OPENVINO_THROW("The group's output layer ", output_layer_name, " has NO OUTPUTS!! - Graph contracts are broken??"); @@ -1327,9 +1354,12 @@ void Partitioner::matchParameters(const std::string& func_name) { // Now walk other submodels and match parameters with the same key // (yes, including the first one) - for (auto&& call : model_group) { + for (std::size_t call_id = 0; call_id < model_group.size(); ++call_id) { LOG_DEBUG("Handle function call..."); LOG_BLOCK(); + auto call = model_group[call_id]; + auto subg_ref = func.refs[call_id]; + std::unordered_set this_model_nodes; for (auto&& node_ptr : call->get_ordered_ops()) { this_model_nodes.insert(node_ptr.get()); @@ -1348,7 +1378,7 @@ void Partitioner::matchParameters(const std::string& func_name) { LOG_DEBUG("Find orig parameter for " << node); auto& orig_param = proto_parameters.at(pkey); auto this_param = std::dynamic_pointer_cast(node); - func.param_call_to_proto[this_param] = orig_param; + func.param_call_to_proto[SubgParam(subg_ref, this_param)] = orig_param; } } } @@ -1386,14 +1416,16 @@ void Partitioner::matchResults(const std::string& func_name) { // Now walk all submodels and match parameters with the same key // (yes, including the first one) - for (auto&& call : model_group) { + for (std::size_t call_idx = 0; call_idx < model_group.size(); ++call_idx) { + auto call = model_group[call_idx]; + auto subg_ref = func.refs[call_idx]; for (auto&& node : call->get_ordered_ops()) { if (ov::op::util::is_output(node)) { auto&& port = node->input(0).get_source_output(); RKey rkey = {layer_to_prototype.at(port.get_node()->get_friendly_name()), port.get_index()}; auto& orig_result = proto_results.at(rkey); auto this_result = std::dynamic_pointer_cast(node); - func.result_call_to_proto[this_result] = orig_result; + func.result_call_to_proto[SubgResult(subg_ref, this_result)] = orig_result; } } } @@ -1517,8 +1549,8 @@ void Partitioner::matchRepeatedSubgraphs(const std::string& func_name) { funcall._gflops = this_sg._gflops; // duplicated code again! funcall._ops = this_sg._ops; // duplicated code again! funcall._avoid_list = this_sg._avoid_list; // duplicated code again! 
-            rearrange_to_function_protocol(body_params, funcall._parameters, func_ggg.param_call_to_proto);
-            rearrange_to_function_protocol(body_results, funcall._results, func_ggg.result_call_to_proto);
+            rearrange_to_function_protocol(this_sg, body_params, funcall._parameters, func_ggg.param_call_to_proto);
+            rearrange_to_function_protocol(this_sg, body_results, funcall._results, func_ggg.result_call_to_proto);
 
             auto func_iter = P.functions.find(func_name);
             NPUW_ASSERT(func_iter != P.functions.end());
@@ -1883,7 +1915,7 @@ void Partitioner::finalizeLinks() {
             auto& params = P.functions.at(sg_desc._funcall)._model->get_parameters();
             auto& proto = func_pipeline_type == FunctionPipelineType::CWAI
                               ? ptr  // no protos in the CWAI case..
-                              : all_functions.at(sg_desc._funcall).param_call_to_proto.at(ptr);
+                              : all_functions.at(sg_desc._funcall).param_call_to_proto.at(SubgParam(sg_desc, ptr));
             auto param_iter = std::find(params.begin(), params.end(), proto);
             NPUW_ASSERT(param_iter != params.end());
             return std::distance(params.begin(), param_iter);
@@ -1904,7 +1936,7 @@ void Partitioner::finalizeLinks() {
             auto& results = P.functions.at(sg_desc._funcall)._model->get_results();
             auto& proto = func_pipeline_type == FunctionPipelineType::CWAI
                               ? ptr  // no protos in the CWAI case...
-                              : all_functions.at(sg_desc._funcall).result_call_to_proto.at(ptr);
+                              : all_functions.at(sg_desc._funcall).result_call_to_proto.at(SubgResult(sg_desc, ptr));
             auto result_iter = std::find(results.begin(), results.end(), proto);
             NPUW_ASSERT(result_iter != results.end());
             return std::distance(results.begin(), result_iter);

From cddcfe83c6f77cb48443907180dea97c22f6d625 Mon Sep 17 00:00:00 2001
From: Andrei Beleiu
Date: Wed, 2 Oct 2024 20:55:21 +0300
Subject: [PATCH 3/3] single-image-test: added accuracy workaround for
 computing CPU reference and adjust the NRMSE and PSNR metrics (#26020)

### Details:
Small improvements for SIT:
- Disable the [dynamic quantization feature](https://github.com/openvinotoolkit/openvino/blob/master/src/inference/include/openvino/runtime/properties.hpp#L574), which is used by default by the CPU pipeline to generate the reference outputs and affects accuracy for some particular models
- Increase the number of decimals for the reported NRMSE metric to improve reporting precision
- For the PSNR metric, report a failure only when the accuracy drops below the target, not when it rises above it (see the sketch after the diff below)

### Tickets:
 - *[CVS-143420](https://jira.devtools.intel.com/browse/CVS-143420)*
---
 src/plugins/intel_npu/tools/single-image-test/main.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/plugins/intel_npu/tools/single-image-test/main.cpp b/src/plugins/intel_npu/tools/single-image-test/main.cpp
index 3b3009bb5f459c..4018982b022ed3 100644
--- a/src/plugins/intel_npu/tools/single-image-test/main.cpp
+++ b/src/plugins/intel_npu/tools/single-image-test/main.cpp
@@ -1200,7 +1200,8 @@ bool computeRRMSE(const ov::Tensor& output, const ov::Tensor& reference) {
 
     double rrmseLoss = sqrt(error / sum);
 
-    std::cout << "RRMSE loss : " << rrmseLoss << " RRMSE threshold : " << FLAGS_rrmse_loss_threshold << std::endl;
+    std::cout << "RRMSE loss : " << std::fixed << std::setprecision(4) << rrmseLoss
+              << " RRMSE threshold : " << FLAGS_rrmse_loss_threshold << std::endl;
     return rrmseLoss <= FLAGS_rrmse_loss_threshold;
 }
 
@@ -1267,7 +1268,8 @@ bool computeNRMSE(const ov::Tensor& output, const ov::Tensor& reference) {
     double nrmseLoss =
             sqrt(error / size) / std::max(0.001f, std::max(maxOutput - minOutput, maxReference - minReference));
 
-    std::cout << "NRMSE loss : " <<
nrmseLoss << " NRMSE threshold : " << FLAGS_nrmse_loss_threshold << std::endl; + std::cout << "NRMSE loss : " << std::fixed << std::setprecision(4) << nrmseLoss + << " NRMSE threshold : " << FLAGS_nrmse_loss_threshold << std::endl; return nrmseLoss <= FLAGS_nrmse_loss_threshold; } @@ -1319,7 +1321,7 @@ bool testPSNR(const TensorMap& outputs, const TensorMap& references, const int d auto result = utils::runPSNRMetric(actOutput, refOutput, dstHeight, dstWidth, scaleBorder, normalizedImage); - if (std::fabs(result - FLAGS_psnr_reference) > FLAGS_psnr_tolerance) { + if (FLAGS_psnr_reference - result > FLAGS_psnr_tolerance) { std::cout << "Absolute difference between actual value " << result << " and reference value " << FLAGS_psnr_reference << " larger then tolerance " << FLAGS_psnr_tolerance << std::endl; return false;
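To make the adjusted PSNR acceptance rule concrete, here is a standalone sketch (simplified; the helper name `psnrWithinTarget` is hypothetical, it only mirrors the `FLAGS_psnr_reference` / `FLAGS_psnr_tolerance` comparison above and is not part of the patch):

```cpp
#include <iostream>

// One-sided acceptance: only a PSNR drop below the reference by more than the
// tolerance is a failure; a result above the reference is always accepted.
bool psnrWithinTarget(double result, double reference, double tolerance) {
    return !(reference - result > tolerance);
}

int main() {
    std::cout << std::boolalpha
              << psnrWithinTarget(32.0, 30.0, 1.0) << "\n"   // above the reference    -> pass
              << psnrWithinTarget(29.5, 30.0, 1.0) << "\n"   // within the tolerance   -> pass
              << psnrWithinTarget(28.0, 30.0, 1.0) << "\n";  // 2.0 below, tolerance 1 -> fail
    return 0;
}
```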