From f47c9f1a25831fbc02517b3ba0a0fa1400a1eac0 Mon Sep 17 00:00:00 2001
From: Alexey Smirnov
Date: Thu, 3 Oct 2024 14:30:25 +0000
Subject: [PATCH] Speed up applying transformations

Previously every lazy tensor was evaluated inside Bank::get() while the
weights bank was being populated. Now the tensors which are not in the
bank yet are evaluated in parallel beforehand, and the already-evaluated
tensors are passed into Bank::get(); the bank itself is populated in a
plain serial loop afterwards.
---
 .../src/plugin/npuw/compiled_model.cpp | 27 +++++++++++++++++--
 .../src/plugin/npuw/weights_bank.cpp   | 15 +++++++++--
 .../src/plugin/npuw/weights_bank.hpp   |  3 ++-
 3 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
index 29f2633b21aca2..9cad96154112c6 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -413,7 +413,10 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
 }
 
 void ov::npuw::CompiledModel::finalize_weights_bank() {
+    // Evaluate lazy tensors which aren't in the bank beforehand
+    std::vector<std::vector<ov::Tensor>> evaluated_tensors(m_compiled_submodels.size());
     ov::parallel_for(m_compiled_submodels.size(), [&](std::size_t idx) {
+        evaluated_tensors[idx].resize(m_compiled_submodels[idx].lazy_closure.size());
         auto& comp_model_desc = m_compiled_submodels[idx];
 
         if (!comp_model_desc.replaced_by) {
@@ -423,17 +426,37 @@ void ov::npuw::CompiledModel::finalize_weights_bank() {
         const auto real_idx = comp_model_desc.replaced_by.value();
         auto& func_desc = m_compiled_submodels[real_idx];
 
+        for (std::size_t tidx = 0; tidx < comp_model_desc.lazy_closure.size(); ++tidx) {
+            const auto& lt = m_compiled_submodels[idx].lazy_closure[tidx];
+            if (m_weights_bank->has(lt, *func_desc.device_it)) {
+                continue;
+            }
+            evaluated_tensors[idx][tidx] = lt.eval();
+        }
+    });
+
+    for (size_t idx = 0; idx < m_compiled_submodels.size(); ++idx) {
+        auto& comp_model_desc = m_compiled_submodels[idx];
+
+        if (!comp_model_desc.replaced_by) {
+            continue;
+        }
+
+        const auto real_idx = comp_model_desc.replaced_by.value();
+        auto& func_desc = m_compiled_submodels[real_idx];
+
         // Due to concat some tensor should be skipped in closure
         m_compiled_submodels[idx].closure.resize(0);
         m_compiled_submodels[idx].is_remote.resize(0);
 
         for (std::size_t tidx = 0; tidx < comp_model_desc.lazy_closure.size(); ++tidx) {
             const auto& lt = m_compiled_submodels[idx].lazy_closure[tidx];
-            m_compiled_submodels[idx].closure.push_back(m_weights_bank->get(lt, *func_desc.device_it));
+            const auto& evaled = evaluated_tensors[idx][tidx];
+            m_compiled_submodels[idx].closure.push_back(m_weights_bank->get(lt, *func_desc.device_it, evaled));
             // FIXME: should is_remote be set unconditionally?
             m_compiled_submodels[idx].is_remote.push_back(true);
         }
-    });
+    }
 }
 
 void ov::npuw::CompiledModel::remove_long_output_names(const std::shared_ptr<ov::Model>& model) {
diff --git a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp
index 0beb8cf9e40aad..73005b6c41ccab 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp
@@ -34,7 +34,7 @@ class BankManager {
     std::mutex m_mutex;
 };
 
-ov::Tensor Bank::get(const LazyTensor& tensor, const std::string& device) {
+ov::Tensor Bank::get(const LazyTensor& tensor, const std::string& device, const ov::Tensor& evaled) {
     if (device != "CPU" && device != "NPU") {
         OPENVINO_THROW("Unsupported device in weights bank allocation: ", device);
     }
@@ -45,7 +45,7 @@ ov::Tensor Bank::get(const LazyTensor& tensor, const std::string& device) {
     auto& device_bank = m_device_bank[device];
     auto iter_device = device_bank.find(tensor);
     if (iter_device == device_bank.end()) {
-        ov::Tensor transformed_tensor = tensor.eval();
+        ov::Tensor transformed_tensor = evaled ? evaled : tensor.eval();
 
         if (device == "CPU" || m_alloc_device != device) {
             // No allocation - store as is
@@ -66,6 +66,17 @@ ov::Tensor Bank::get(const LazyTensor& tensor, const std::string& device) {
     return iter_device->second;
 }
 
+bool Bank::has(const LazyTensor& tensor, const std::string& device) {
+    if (device != "CPU" && device != "NPU") {
+        OPENVINO_THROW("Unsupported device in weights bank allocation: ", device);
+    }
+
+    std::lock_guard<std::mutex> guard(m_mutex);
+
+    const auto& device_bank = m_device_bank[device];
+    return device_bank.find(tensor) != device_bank.end();
+}
+
 std::shared_ptr<Bank> BankManager::getBank(const std::string& bank_name,
                                            const std::shared_ptr<const ov::ICore>& core,
                                            const std::string& alloc_device) {
diff --git a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.hpp b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.hpp
index 80c368a3095bd4..f4b89cff720d61 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.hpp
@@ -27,7 +27,8 @@ class Bank {
           m_alloc_device(alloc_device) {}
 
     // Based on previously captured lazy tensor allocate a new tensor (if needed) on a specified device
-    ov::Tensor get(const LazyTensor& tensor, const std::string& device);
+    ov::Tensor get(const LazyTensor& tensor, const std::string& device, const ov::Tensor& evaled = ov::Tensor());
+    bool has(const LazyTensor& tensor, const std::string& device);
 
 private:
     // Bank for specified device and their allocated memory