Commit: Address review comments part 1

smirnov-alexey committed Oct 2, 2024
1 parent 6b02839 commit 1dc99f2
Showing 11 changed files with 112 additions and 201 deletions.
@@ -49,7 +49,7 @@ DEFINE_OPT(NPUW_DCOFF_SCALE, bool, false, npuw::partitioning::dcoff_with_scale,
DEFINE_OPT(NPUW_FUNCALL_FOR_ALL, bool, false, npuw::partitioning::funcall_for_all, CompileTime);
DEFINE_OPT(NPUW_PARALLEL_COMPILE, bool, false, npuw::parallel_compilation, CompileTime);
DEFINE_OPT(NPUW_WEIGHTS_BANK, std::string, "", npuw::weights_bank, CompileTime);
-DEFINE_OPT(NPUW_WEIGHTS_BANK_ALLOC, bool, false, npuw::weights_bank_alloc, CompileTime);
+DEFINE_OPT(NPUW_WEIGHTS_BANK_ALLOC, std::string, "", npuw::weights_bank_alloc, CompileTime);
DEFINE_OPT(NPUW_FUNCALL_ASYNC, bool, false, npuw::funcall_async, RunTime);
DEFINE_OPT(NPUW_ACC_CHECK, bool, false, npuw::accuracy::check, RunTime);
DEFINE_OPT(NPUW_ACC_THRESH, double, 0.01, npuw::accuracy::threshold, RunTime);
@@ -48,10 +48,10 @@ static constexpr ov::Property<std::string> weights_bank{"NPUW_WEIGHTS_BANK"};
/**
* @brief
* Type: std::string.
- * Specify if weights bank is allowed to allocate NPU memory.
+ * Specify device name for weights bank which is used to allocate memory.
 * Default value: "".
*/
-static constexpr ov::Property<bool> weights_bank_alloc{"NPUW_WEIGHTS_BANK_ALLOC"};
+static constexpr ov::Property<std::string> weights_bank_alloc{"NPUW_WEIGHTS_BANK_ALLOC"};

namespace partitioning {
namespace online {
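Note: downstream of this change, NPUW_WEIGHTS_BANK_ALLOC carries a device name rather than an on/off switch. A minimal usage sketch (assuming NPUW is enabled for the target device and that these private properties are accepted as plain strings in the compile config; the model path and bank tag are illustrative):

```cpp
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    const auto model = core.read_model("model.xml");  // illustrative path
    // Share one weights bank across compiled models and let it allocate
    // its memory on the NPU device.
    auto compiled = core.compile_model(model,
                                       "NPU",
                                       ov::AnyMap{{"NPUW_WEIGHTS_BANK", "shared"},
                                                  {"NPUW_WEIGHTS_BANK_ALLOC", "NPU"}});
    return 0;
}
```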
3 changes: 1 addition & 2 deletions src/plugins/intel_npu/src/backend/src/zero_remote_tensor.cpp
@@ -88,8 +88,7 @@ void ZeroRemoteTensor::allocate(const size_t bytes) {
ze_host_mem_alloc_flag_t flag = ZE_HOST_MEM_ALLOC_FLAG_BIAS_WRITE_COMBINED;
desc = {ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, nullptr, static_cast<ze_host_mem_alloc_flags_t>(flag)};
} else {
-        ze_host_mem_alloc_flag_t flag = ZE_HOST_MEM_ALLOC_FLAG_BIAS_CACHED;
-        desc = {ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, nullptr, static_cast<ze_host_mem_alloc_flags_t>(flag)};
+        desc = {ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, nullptr, 0};
}
zeroUtils::throwOnFail("zeMemAllocHost",
zeMemAllocHost(_init_structs->getContext(), &desc, size, STANDARD_PAGE_SIZE, &_data));
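Note: a zero flags field lets the Level Zero driver pick its default host allocation policy (cacheable memory), so dropping the explicit BIAS_CACHED hint should not change behavior. A condensed view of the resulting branch (paraphrased from the hunk above; the input/output predicate is outside this hunk and hypothetical here):

```cpp
ze_host_mem_alloc_desc_t desc = {};
if (is_input) {  // hypothetical predicate - the real condition is not shown in this diff
    // Host writes, device reads: bias the allocation to write-combined.
    ze_host_mem_alloc_flag_t flag = ZE_HOST_MEM_ALLOC_FLAG_BIAS_WRITE_COMBINED;
    desc = {ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, nullptr, static_cast<ze_host_mem_alloc_flags_t>(flag)};
} else {
    // 0 == no bias flags: the driver default (cacheable) host memory.
    desc = {ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, nullptr, 0};
}
```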
26 changes: 4 additions & 22 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -115,7 +115,7 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,

// Initialize weights bank
const std::string weights_bank_opt = m_cfg.get<::intel_npu::NPUW_WEIGHTS_BANK>();
-    bool wbank_alloc = m_cfg.get<::intel_npu::NPUW_WEIGHTS_BANK_ALLOC>();
+    const std::string wbank_alloc = m_cfg.get<::intel_npu::NPUW_WEIGHTS_BANK_ALLOC>();
m_weights_bank = ov::npuw::weights::bank(weights_bank_opt, plugin->get_core(), wbank_alloc);

LOG_VERB("*** Original model ***");
@@ -235,6 +235,8 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
} // for(ordered_subgraphs)
// NOTE(dm): there's a better way to do it, like we do in G-API backends.

+    m_update_required = m_cfg.get<::intel_npu::NPUW_FOLD>() ? true : false;

    // Store mapping between manually split inputs/outputs
// to connect tensors between compiled submodels
m_submodels_input_to_prev_output = partitioning.input_to_prev_output;
@@ -289,8 +291,6 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
m_compiled_submodels[id].transformations = subgraph._transformations;
m_compiled_submodels[id].scales = subgraph._scales;
m_compiled_submodels[id].zerops = subgraph._zerops;
-            m_compiled_submodels[id].update_required.resize(subgraph._closure.size(),
-                                                            m_cfg.get<::intel_npu::NPUW_FOLD>() ? true : false);
m_compiled_submodels[id].is_remote.resize(subgraph._closure.size(), false);
} // if(!funcall)

@@ -428,28 +428,10 @@ void ov::npuw::CompiledModel::finalize_weights_bank() {

for (std::size_t tidx = 0; tidx < comp_model_desc.transformations.size(); ++tidx) {
const auto& lt = m_compiled_submodels[idx].transformations[tidx];

-            // FIXME: probably should be more careful with the devices here
-            if (!m_weights_bank->has(lt, *func_desc.device_it)) {
-                ov::Tensor evaled = lt.eval();
-                if (lt.has_concat()) {
-                    // FIXME: probably setting just front() LazyTensor here is enough
-                    for (const auto& lt_to_concat : lt.get_lt_to_concat()) {
-                        // Note: this also includes this LazyTensor's original ov::Tensor
-                        m_weights_bank->store(lt_to_concat, evaled, *func_desc.device_it);
-                    }
-                } else {
-                    m_weights_bank->store(lt, evaled, *func_desc.device_it);
-                }
-            }

m_compiled_submodels[idx].closure.push_back(m_weights_bank->get(lt, *func_desc.device_it));
// FIXME: should is_remote be set unconditionally?
m_compiled_submodels[idx].is_remote.push_back(true);
}

-        // After concat size might have changed
-        m_compiled_submodels[idx].update_required.resize(m_compiled_submodels[idx].closure.size(),
-                                                         m_cfg.get<::intel_npu::NPUW_FOLD>() ? true : false);
});
}

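Note: the evaluate-and-store logic removed above now lives behind the bank's get(). The bank implementation is not part of this diff; a hypothetical sketch of the contract the caller relies on (names and locking are assumptions) might look like:

```cpp
// Hypothetical sketch - not the actual weights bank code (not shown in this diff).
// get() returns the evaluated tensor, materializing and caching it per device
// on first use, so callers no longer pre-evaluate LazyTensors themselves.
ov::Tensor Bank::get(const LazyTensor& lt, const std::string& device) {
    std::lock_guard<std::mutex> guard(m_mutex);
    auto& device_bank = m_device_banks[device];
    auto iter = device_bank.find(lt);
    if (iter == device_bank.end()) {
        // First request: evaluate the lazy transformation chain and cache it
        iter = device_bank.emplace(lt, lt.eval()).first;
    }
    return iter->second;
}
```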
3 changes: 2 additions & 1 deletion src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
@@ -125,7 +125,6 @@ class CompiledModel : public ov::ICompiledModel {
std::vector<weights::LazyTensor> transformations;
std::vector<ov::Tensor> scales;
std::vector<ov::Tensor> zerops;
-        std::vector<bool> update_required;
std::vector<bool> is_remote;

// FIXME: Take it out of structure
@@ -137,6 +136,8 @@
};
std::vector<CompiledModelDesc> m_compiled_submodels;

+    bool m_update_required;

std::function<bool(const ov::SoPtr<ov::ITensor>&, const ov::SoPtr<ov::ITensor>&)> m_acc_check;
std::string m_ref_device;

@@ -192,7 +192,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com

// No update required to this tensor in runtime - so it can be set only once
// Will be utilized when there is no FOLDing
-            if (!comp_model_desc.update_required[cidx]) {
+            if (!m_npuw_model->m_update_required) {
// At this point closure already contains allocated and transformed tensor ready to be used
request->set_tensor(iport, ov::get_tensor_impl(closure));
}
@@ -498,7 +498,7 @@ void ov::npuw::JustInferRequest::unpack_closure(std::size_t idx, RqPtr request)
if (closure.get_element_type() != clparam->get_element_type()) {
// Remember where the unpack is required
closure_unpack_required.push_back(cidx);
-        } else if (comp_model_desc.update_required[cidx]) {
+        } else if (m_npuw_model->m_update_required) {
if (needs_copy(idx, cidx)) {
                // Remember where copy is required
closure_copy_required.push_back(cidx);
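Note: the replaced per-tensor update_required vector always held one uniform value derived from NPUW_FOLD, which is why a single model-wide flag suffices; schematically (names taken from this diff):

```cpp
// The flag is computed once per compiled model:
const bool update_required = m_cfg.get<::intel_npu::NPUW_FOLD>();
// update_required == false: a closure tensor never changes at runtime, so it
// can be bound to the underlying infer request once, at construction time.
// update_required == true: closure tensors are rewritten by unpack_closure()
// on every run instead.
```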
126 changes: 63 additions & 63 deletions src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.cpp
@@ -5,6 +5,8 @@
#include "lazy_tensor.hpp"

using ov::npuw::weights::ConcatMeta;
+using ov::npuw::weights::ConstPtr;
+using ov::npuw::weights::LTData;
using ov::npuw::weights::LazyTensor;
using ov::npuw::weights::Transform;
using ov::npuw::weights::TransformType;
@@ -21,22 +23,33 @@ std::size_t LazyTensor::Hash::operator()(const LazyTensor& lt) const {
seed ^= std::hash<std::size_t>()(axis) + 0x9e3779b9;
}
} else if (tr.first == TransformType::CONCAT) {
-            // concat tag can be different, no need to hash it
             const auto& axis = std::get<ConcatMeta>(tr.second).second;
             seed ^= std::hash<std::size_t>()(axis) + 0x9e3779b9;
+            for (const auto& lt : std::get<ConcatMeta>(tr.second).first) {
+                seed ^= LazyTensor::Hash::operator()(lt) + 0x9e3779b9;
+            }
}
}
return seed;
}

LazyTensor::LazyTensor(const TransformType& type, const Transform& transform) {
-    // Sanity check
-    NPUW_ASSERT(type == TransformType::TENSOR && std::holds_alternative<ov::Tensor>(transform));
-    m_transforms.push_back({type, transform});
-    const auto& tensor = std::get<ov::Tensor>(transform);
-    m_orig_data = tensor.data();
-    m_orig_shape = tensor.get_shape();
-    m_orig_type = tensor.get_element_type();
+    if (type == TransformType::TENSOR && std::holds_alternative<LTData>(transform)) {
+        m_transforms.push_back({type, transform});
+        ov::Tensor tensor;
+        if (std::holds_alternative<ConstPtr>(std::get<LTData>(transform))) {
+            tensor = ov::npuw::util::tensor_from_const(std::get<ConstPtr>(std::get<LTData>(transform)));
+        } else {
+            tensor = std::get<ov::Tensor>(std::get<LTData>(transform));
+        }
+        m_orig_data = tensor.data();
+        m_orig_shape = tensor.get_shape();
+        m_orig_type = tensor.get_element_type();
+    } else if (type == TransformType::CONCAT && std::holds_alternative<ConcatMeta>(transform)) {
+        m_transforms.push_back({type, transform});
+    } else {
+        NPUW_ASSERT(false);
+    }
}

bool LazyTensor::operator==(const LazyTensor& other) const {
@@ -56,11 +69,19 @@ bool LazyTensor::operator==(const LazyTensor& other) const {
return false;
}
} else if (m_transforms[i].first == TransformType::CONCAT) {
-            // concat tag can be different, no need to compare it
-            if (std::get<ConcatMeta>(m_transforms[i].second).second !=
-                std::get<ConcatMeta>(other.m_transforms[i].second).second) {
+            const auto& m1 = std::get<ConcatMeta>(m_transforms[i].second);
+            const auto& m2 = std::get<ConcatMeta>(other.m_transforms[i].second);
+            if (m1.second != m2.second) {
                 return false;
             }
+            if (m1.first.size() != m2.first.size()) {
+                return false;
+            }
+            for (std::size_t mi = 0; mi < m1.first.size(); ++mi) {
+                if (!(m1.first[mi] == m2.first[mi])) {
+                    return false;
+                }
+            }
}
}

@@ -70,8 +91,7 @@ bool LazyTensor::operator==(const LazyTensor& other) const {
void LazyTensor::update(const TransformType& type, const Transform& transform) {
// Sanity check
NPUW_ASSERT((type == TransformType::PERMUTE && std::holds_alternative<std::vector<std::size_t>>(transform)) ||
-                (type == TransformType::CONVERT && std::holds_alternative<std::monostate>(transform)) ||
-                (type == TransformType::CONCAT && std::holds_alternative<ConcatMeta>(transform)));
+                (type == TransformType::CONVERT && std::holds_alternative<std::monostate>(transform)));
m_transforms.push_back({type, transform});
}

@@ -86,76 +106,56 @@ ov::Tensor LazyTensor::eval() const {
Perhaps it should be done after model compilation and not handled here.
*/

-    // Sanity check
-    NPUW_ASSERT(std::holds_alternative<ov::Tensor>(m_transforms.front().second));
-
-    ov::Tensor transformed = get_orig_tensor();
+    ov::Tensor transformed;
     ov::Tensor tnew;
-    for (auto& tr : m_transforms) {
+
+    NPUW_ASSERT(!m_transforms.empty());

+    // Process the initial tensor - either from Const or from Concat
+    if (m_transforms.front().first == TransformType::TENSOR) {
+        transformed = get_orig_tensor();
+    } else if (m_transforms.front().first == TransformType::CONCAT) {
+        std::vector<ov::Tensor> to_concat;
+        for (const auto& lt : std::get<ConcatMeta>(m_transforms.front().second).first) {
+            // Sanity check
+            NPUW_ASSERT(!lt.has_transformations());
+            to_concat.push_back(lt.get_orig_tensor());
+        }
+        transformed = ov::npuw::util::concat(to_concat, std::get<ConcatMeta>(m_transforms.front().second).second);
+    } else {
+        NPUW_ASSERT(false);
+    }
+
+    // Process transformation on top of initial tensor
+    for (std::size_t i = 1; i < m_transforms.size(); ++i) {
+        const auto& tr = m_transforms[i];
switch (tr.first) {
-        case TransformType::TENSOR:
-            continue;
         case TransformType::PERMUTE:
             tnew = ov::npuw::util::permute(transformed, std::get<std::vector<std::size_t>>(tr.second));
             tnew.copy_to(transformed);
             break;
         case TransformType::CONVERT:
             tnew = ov::npuw::util::to_f16(transformed);
             tnew.copy_to(transformed);
             break;
-        case TransformType::CONCAT:
-            tnew = ov::npuw::util::concat(get_to_concat(), std::get<ConcatMeta>(tr.second).second);
-            tnew.copy_to(transformed);
         default:
             NPUW_ASSERT(false);
}
}

return transformed;
}

-void* LazyTensor::get_orig_data() const {
-    return m_orig_data;
-}
-
 ov::Tensor LazyTensor::get_orig_tensor() const {
     // Sanity check
-    NPUW_ASSERT(std::holds_alternative<ov::Tensor>(m_transforms.front().second));
-    return std::get<ov::Tensor>(m_transforms.front().second);
+    NPUW_ASSERT(!has_transformations());
+    if (std::holds_alternative<ConstPtr>(std::get<LTData>(m_transforms.front().second))) {
+        return ov::npuw::util::tensor_from_const(std::get<ConstPtr>(std::get<LTData>(m_transforms.front().second)));
+    }
+    return std::get<ov::Tensor>(std::get<LTData>(m_transforms.front().second));
 }
 
-bool LazyTensor::has_concat() const {
-    for (auto& tr : m_transforms) {
-        if (tr.first == TransformType::CONCAT) {
-            return true;
-        }
-    }
-    return false;
-}
-
 bool LazyTensor::has_transformations() const {
-    // The first transformation is always initial Tensor
-    return m_transforms.size() > 1;
-}
-
-std::vector<ov::Tensor> LazyTensor::get_to_concat() const {
-    NPUW_ASSERT(has_concat());
-    std::vector<ov::Tensor> to_concat;
-    for (auto& tr : m_transforms) {
-        if (tr.first == TransformType::CONCAT) {
-            for (const auto& lt : std::get<ConcatMeta>(tr.second).first) {
-                to_concat.push_back(lt.get_orig_tensor());
-            }
-        }
-    }
-    return to_concat;
-}
-
-std::vector<LazyTensor> LazyTensor::get_lt_to_concat() const {
-    NPUW_ASSERT(has_concat());
-    for (auto& tr : m_transforms) {
-        if (tr.first == TransformType::CONCAT) {
-            return std::get<ConcatMeta>(tr.second).first;
-        }
-    }
-    NPUW_ASSERT(false);
-    return {};
+    // The first transformation is always initial Tensor or Concat
+    if (m_transforms.size() == 1 && m_transforms.front().first == TransformType::TENSOR) {
+        return false;
+    }
+    return true;
 }
11 changes: 5 additions & 6 deletions src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.hpp
@@ -25,8 +25,11 @@ enum class TransformType : int { TENSOR, PERMUTE, CONVERT, CONCAT };
class LazyTensor;

using ConcatMeta = std::pair<std::vector<LazyTensor>, std::size_t>;
+using ConstPtr = std::shared_ptr<ov::op::v0::Constant>;
+using LTData = std::variant<ConstPtr, ov::Tensor>;

-using Transform = std::variant<ov::Tensor, std::vector<std::size_t>, std::monostate, ConcatMeta>;
+// LazyTensor owns Constant's memory
+using Transform = std::variant<LTData, std::vector<std::size_t>, std::monostate, ConcatMeta>;

class LazyTensor {
public:
@@ -43,17 +46,13 @@ class LazyTensor {
void update(const TransformType& type, const Transform& transform);
ov::Tensor eval() const;

-    void* get_orig_data() const;
     ov::Tensor get_orig_tensor() const;
 
-    bool has_concat() const;
     bool has_transformations() const;
-    std::vector<ov::Tensor> get_to_concat() const;
-    std::vector<LazyTensor> get_lt_to_concat() const;

private:
std::vector<std::pair<TransformType, Transform>> m_transforms;
-    void* m_orig_data;
+    void* m_orig_data = nullptr;
ov::Shape m_orig_shape;
ov::element::Type m_orig_type;
};
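Note: for orientation, the four TransformType values pair with the Transform alternatives as follows (illustrative values):

```cpp
// TENSOR: the initial data, either a Constant or an already-materialized tensor.
Transform t_root = LTData{ov::op::v0::Constant::create(ov::element::f32, ov::Shape{2, 2}, {0.f, 0.f, 0.f, 0.f})};
// PERMUTE: the axis order to apply.
Transform t_perm = std::vector<std::size_t>{1, 0};
// CONVERT: no payload; the target type (f16) is implied.
Transform t_conv = std::monostate{};
// CONCAT: the LazyTensors to fuse plus the concatenation axis.
Transform t_cat = ConcatMeta{std::vector<LazyTensor>{}, std::size_t{0}};
```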
@@ -1461,8 +1461,7 @@ void Partitioner::createFunction(FunctionPipeline& func_ggg) {

LOG_DEBUG("Register " << prod_output << " in the function closure");
funcall._transformations.push_back(LazyTensor(
-                TransformType::TENSOR,
-                bank->update(std::dynamic_pointer_cast<ov::op::v0::Constant>(input_node)))); // (n)/1/i/c
+                TransformType::TENSOR, std::dynamic_pointer_cast<ov::op::v0::Constant>(input_node))); // (n)/1/i/c
} else if (ov::op::util::is_parameter(input_node)) {
LOG_DEBUG("Handling a Parameter input " << prod_output);
LOG_BLOCK();
@@ -1559,8 +1558,7 @@ void Partitioner::matchRepeatedSubgraphs(const std::string& func_name) {
LOG_DEBUG("Register " << prod_output << " in the function closure[" << param_idx
<< "] (via prototype " << proto_layer_name << ")");
funcall._transformations[param_idx - function._param_offset] = LazyTensor(
-                    TransformType::TENSOR,
-                    bank->update(std::dynamic_pointer_cast<ov::op::v0::Constant>(input_node))); // (t)/1/c
+                    TransformType::TENSOR, std::dynamic_pointer_cast<ov::op::v0::Constant>(input_node)); // (t)/1/c
}
} // for (inputs)
} // for(nodes)
@@ -1650,18 +1648,15 @@ void Partitioner::optimize(const std::string& func_name) {
auto& funcall = func_group.refs[f_idx].get();
std::vector<LazyTensor> to_concat;
// Fill tensor vector
-            for (auto&& cidx : to_concat_idx) {
-                to_concat.push_back(funcall._transformations[cidx]);
-            }
-            // Set to lazy tensor history
             for (auto&& cidx : to_concat_idx) {
-                // FIXME: Assuming here concat goes first and other transformations later.
-                // This allows to store ov::Tensor and ignore their potential history of transformations
-                funcall._transformations[cidx].update(TransformType::CONCAT, std::make_pair(to_concat, axis));
+                NPUW_ASSERT(!funcall._transformations[cidx].has_transformations());
+                to_concat.push_back(funcall._transformations[cidx]);
             }
-            // Pick the first (could be any) LazyTensor and set as new future-concatenated tensor
+            // Note: we can ignore updating funcall._transformations[cidx] here since those LazyTensors will be gone and the new one added into the vector
             if (!to_concat.empty()) {
-                funcall._transformations.push_back(to_concat.front());
+                funcall._transformations.push_back(LazyTensor(TransformType::CONCAT, std::make_pair(to_concat, axis)));
}
});
}
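Note: the observable effect of the optimize() change, written as a post-condition sketch (hypothetical check; names from the diff):

```cpp
// Members picked for concatenation stay plain (transformation-free), and one
// new CONCAT-rooted LazyTensor is appended to represent the fused weight.
for (auto&& cidx : to_concat_idx) {
    NPUW_ASSERT(!funcall._transformations[cidx].has_transformations());
}
const auto& fused = funcall._transformations.back();
// A lone CONCAT root reports true here - see has_transformations() in
// lazy_tensor.cpp above.
NPUW_ASSERT(fused.has_transformations());
```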