From f47c9f1a25831fbc02517b3ba0a0fa1400a1eac0 Mon Sep 17 00:00:00 2001
From: Alexey Smirnov
Date: Thu, 3 Oct 2024 14:30:25 +0000
Subject: [PATCH] Speed up applying transformations

Previously every lazy tensor was evaluated inside Bank::get() while the
weights bank was being populated. Now the tensors which are not in the
bank yet are evaluated in parallel beforehand, and the already-evaluated
tensors are passed into Bank::get(); the bank itself is populated in a
plain serial loop afterwards.
---
 .../src/plugin/npuw/compiled_model.cpp | 27 +++++++++++++++++--
 .../src/plugin/npuw/weights_bank.cpp   | 15 +++++++++--
 .../src/plugin/npuw/weights_bank.hpp   |  3 ++-
 3 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
index 29f2633b21aca2..9cad96154112c6 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -413,7 +413,10 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
 }
 
 void ov::npuw::CompiledModel::finalize_weights_bank() {
+    // Evaluate lazy tensors which aren't in the bank beforehand
+    std::vector<std::vector<ov::Tensor>> evaluated_tensors(m_compiled_submodels.size());
     ov::parallel_for(m_compiled_submodels.size(), [&](std::size_t idx) {
+        evaluated_tensors[idx].resize(m_compiled_submodels[idx].lazy_closure.size());
         auto& comp_model_desc = m_compiled_submodels[idx];
 
         if (!comp_model_desc.replaced_by) {
@@ -423,17 +426,37 @@ void ov::npuw::CompiledModel::finalize_weights_bank() {
         const auto real_idx = comp_model_desc.replaced_by.value();
         auto& func_desc = m_compiled_submodels[real_idx];
 
+        for (std::size_t tidx = 0; tidx < comp_model_desc.lazy_closure.size(); ++tidx) {
+            const auto& lt = m_compiled_submodels[idx].lazy_closure[tidx];
+            if (m_weights_bank->has(lt, *func_desc.device_it)) {
+                continue;
+            }
+            evaluated_tensors[idx][tidx] = lt.eval();
+        }
+    });
+
+    for (size_t idx = 0; idx < m_compiled_submodels.size(); ++idx) {
+        auto& comp_model_desc = m_compiled_submodels[idx];
+
+        if (!comp_model_desc.replaced_by) {
+            continue;
+        }
+
+        const auto real_idx = comp_model_desc.replaced_by.value();
+        auto& func_desc = m_compiled_submodels[real_idx];
+
         // Due to concat some tensor should be skipped in closure
         m_compiled_submodels[idx].closure.resize(0);
         m_compiled_submodels[idx].is_remote.resize(0);
 
         for (std::size_t tidx = 0; tidx < comp_model_desc.lazy_closure.size(); ++tidx) {
             const auto& lt = m_compiled_submodels[idx].lazy_closure[tidx];
-            m_compiled_submodels[idx].closure.push_back(m_weights_bank->get(lt, *func_desc.device_it));
+            const auto& evaled = evaluated_tensors[idx][tidx];
+            m_compiled_submodels[idx].closure.push_back(m_weights_bank->get(lt, *func_desc.device_it, evaled));
             // FIXME: should is_remote be set unconditionally?
             m_compiled_submodels[idx].is_remote.push_back(true);
         }
-    });
+    }
 }
 
 void ov::npuw::CompiledModel::remove_long_output_names(const std::shared_ptr<ov::Model>& model) {
diff --git a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp
index 0beb8cf9e40aad..73005b6c41ccab 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp
@@ -34,7 +34,7 @@ class BankManager {
     std::mutex m_mutex;
 };
 
-ov::Tensor Bank::get(const LazyTensor& tensor, const std::string& device) {
+ov::Tensor Bank::get(const LazyTensor& tensor, const std::string& device, const ov::Tensor& evaled) {
     if (device != "CPU" && device != "NPU") {
         OPENVINO_THROW("Unsupported device in weights bank allocation: ", device);
     }
@@ -45,7 +45,7 @@ ov::Tensor Bank::get(const LazyTensor& tensor, const std::string& device) {
     auto& device_bank = m_device_bank[device];
     auto iter_device = device_bank.find(tensor);
     if (iter_device == device_bank.end()) {
-        ov::Tensor transformed_tensor = tensor.eval();
+        ov::Tensor transformed_tensor = evaled ? evaled : tensor.eval();
 
         if (device == "CPU" || m_alloc_device != device) {
             // No allocation - store as is
@@ -66,6 +66,17 @@ ov::Tensor Bank::get(const LazyTensor& tensor, const std::string& device) {
     return iter_device->second;
 }
 
+bool Bank::has(const LazyTensor& tensor, const std::string& device) {
+    if (device != "CPU" && device != "NPU") {
+        OPENVINO_THROW("Unsupported device in weights bank allocation: ", device);
+    }
+
+    std::lock_guard<std::mutex> guard(m_mutex);
+
+    const auto& device_bank = m_device_bank[device];
+    return device_bank.find(tensor) != device_bank.end();
+}
+
 std::shared_ptr<Bank> BankManager::getBank(const std::string& bank_name,
                                            const std::shared_ptr<const ov::ICore>& core,
                                            const std::string& alloc_device) {
diff --git a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.hpp b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.hpp
index 80c368a3095bd4..f4b89cff720d61 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.hpp
@@ -27,7 +27,8 @@ class Bank {
           m_alloc_device(alloc_device) {}
 
     // Based on previously captured lazy tensor allocate a new tensor (if needed) on a specified device
-    ov::Tensor get(const LazyTensor& tensor, const std::string& device);
+    ov::Tensor get(const LazyTensor& tensor, const std::string& device, const ov::Tensor& evaled = ov::Tensor());
+    bool has(const LazyTensor& tensor, const std::string& device);
 
 private:
     // Bank for specified device and their allocated memory