From 1dc99f232487c2e59592d3ed95c38e6699a10af4 Mon Sep 17 00:00:00 2001
From: Alexey Smirnov
Date: Wed, 2 Oct 2024 16:27:10 +0000
Subject: [PATCH] Address review comments part 1

---
 .../al/include/intel_npu/al/config/npuw.hpp   |   2 +-
 .../al/include/npuw_private_properties.hpp    |   4 +-
 .../src/backend/src/zero_remote_tensor.cpp    |   3 +-
 .../src/plugin/npuw/compiled_model.cpp        |  26 +---
 .../src/plugin/npuw/compiled_model.hpp        |   3 +-
 .../plugin/npuw/just_sync_infer_request.cpp   |   4 +-
 .../intel_npu/src/plugin/npuw/lazy_tensor.cpp | 126 +++++++++---------
 .../intel_npu/src/plugin/npuw/lazy_tensor.hpp |  11 +-
 .../plugin/npuw/partitioning/partitioning.cpp |  17 +--
 .../src/plugin/npuw/weights_bank.cpp          | 100 +++-----------
 .../src/plugin/npuw/weights_bank.hpp          |  17 +--
 11 files changed, 112 insertions(+), 201 deletions(-)

diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp
index 147116364e6f36..e73874cd4bd57a 100644
--- a/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp
+++ b/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp
@@ -49,7 +49,7 @@ DEFINE_OPT(NPUW_DCOFF_SCALE, bool, false, npuw::partitioning::dcoff_with_scale,
 DEFINE_OPT(NPUW_FUNCALL_FOR_ALL, bool, false, npuw::partitioning::funcall_for_all, CompileTime);
 DEFINE_OPT(NPUW_PARALLEL_COMPILE, bool, false, npuw::parallel_compilation, CompileTime);
 DEFINE_OPT(NPUW_WEIGHTS_BANK, std::string, "", npuw::weights_bank, CompileTime);
-DEFINE_OPT(NPUW_WEIGHTS_BANK_ALLOC, bool, false, npuw::weights_bank_alloc, CompileTime);
+DEFINE_OPT(NPUW_WEIGHTS_BANK_ALLOC, std::string, "", npuw::weights_bank_alloc, CompileTime);
 DEFINE_OPT(NPUW_FUNCALL_ASYNC, bool, false, npuw::funcall_async, RunTime);
 DEFINE_OPT(NPUW_ACC_CHECK, bool, false, npuw::accuracy::check, RunTime);
 DEFINE_OPT(NPUW_ACC_THRESH, double, 0.01, npuw::accuracy::threshold, RunTime);
diff --git a/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp b/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp
index 3b794e72f0ddbd..b83c7518df389e 100644
--- a/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp
+++ b/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp
@@ -48,10 +48,10 @@ static constexpr ov::Property<std::string> weights_bank{"NPUW_WEIGHTS_BANK"};
 /**
  * @brief
  * Type: std::string.
- * Specify if weights bank is allowed to allocate NPU memory.
- * Default value: false.
+ * Specify the device name to be used by the weights bank for memory allocation.
+ * Default value: empty.
 */
-static constexpr ov::Property<bool> weights_bank_alloc{"NPUW_WEIGHTS_BANK_ALLOC"};
+static constexpr ov::Property<std::string> weights_bank_alloc{"NPUW_WEIGHTS_BANK_ALLOC"};
 
 namespace partitioning {
 namespace online {
diff --git a/src/plugins/intel_npu/src/backend/src/zero_remote_tensor.cpp b/src/plugins/intel_npu/src/backend/src/zero_remote_tensor.cpp
index 19590e76f45883..4ac1d75fe57f10 100644
--- a/src/plugins/intel_npu/src/backend/src/zero_remote_tensor.cpp
+++ b/src/plugins/intel_npu/src/backend/src/zero_remote_tensor.cpp
@@ -88,8 +88,7 @@ void ZeroRemoteTensor::allocate(const size_t bytes) {
         ze_host_mem_alloc_flag_t flag = ZE_HOST_MEM_ALLOC_FLAG_BIAS_WRITE_COMBINED;
         desc = {ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, nullptr, static_cast<ze_host_mem_alloc_flags_t>(flag)};
     } else {
-        ze_host_mem_alloc_flag_t flag = ZE_HOST_MEM_ALLOC_FLAG_BIAS_CACHED;
-        desc = {ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, nullptr, static_cast<ze_host_mem_alloc_flags_t>(flag)};
+        desc = {ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, nullptr, 0};
     }
     zeroUtils::throwOnFail("zeMemAllocHost",
                            zeMemAllocHost(_init_structs->getContext(), &desc, size, STANDARD_PAGE_SIZE, &_data));
diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
index ba804fee44dda6..73345113e89b2d 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -115,7 +115,7 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
 
     // Initialize weights bank
     const std::string weights_bank_opt = m_cfg.get<::intel_npu::NPUW_WEIGHTS_BANK>();
-    bool wbank_alloc = m_cfg.get<::intel_npu::NPUW_WEIGHTS_BANK_ALLOC>();
+    const std::string wbank_alloc = m_cfg.get<::intel_npu::NPUW_WEIGHTS_BANK_ALLOC>();
     m_weights_bank = ov::npuw::weights::bank(weights_bank_opt, plugin->get_core(), wbank_alloc);
 
     LOG_VERB("*** Original model ***");
@@ -235,6 +235,8 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
     }  // for(ordered_subgraphs)
     // NOTE(dm): there's a better way to do it, like we do in G-API backends.
 
+    m_update_required = m_cfg.get<::intel_npu::NPUW_FOLD>();
+
     // Store mapping between manually splitted inputs/outputs
     // to connect tensors between compiled submodels
     m_submodels_input_to_prev_output = partitioning.input_to_prev_output;
@@ -289,8 +291,6 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
             m_compiled_submodels[id].transformations = subgraph._transformations;
             m_compiled_submodels[id].scales = subgraph._scales;
             m_compiled_submodels[id].zerops = subgraph._zerops;
-            m_compiled_submodels[id].update_required.resize(subgraph._closure.size(),
-                                                            m_cfg.get<::intel_npu::NPUW_FOLD>() ? true : false);
             m_compiled_submodels[id].is_remote.resize(subgraph._closure.size(), false);
         }  // if(!funcall)
 
@@ -428,28 +428,10 @@ void ov::npuw::CompiledModel::finalize_weights_bank() {
 
         for (std::size_t tidx = 0; tidx < comp_model_desc.transformations.size(); ++tidx) {
             const auto& lt = m_compiled_submodels[idx].transformations[tidx];
-
-            // FIXME: probably should be more careful with the devices here
-            if (!m_weights_bank->has(lt, *func_desc.device_it)) {
-                ov::Tensor evaled = lt.eval();
-                if (lt.has_concat()) {
-                    // FIXME: probably setting just front() LazyTensor here is enough
-                    for (const auto& lt_to_concat : lt.get_lt_to_concat()) {
-                        // Note: this also includes this LazyTensor's original ov::Tensor
-                        m_weights_bank->store(lt_to_concat, evaled, *func_desc.device_it);
-                    }
-                } else {
-                    m_weights_bank->store(lt, evaled, *func_desc.device_it);
-                }
-            }
-
             m_compiled_submodels[idx].closure.push_back(m_weights_bank->get(lt, *func_desc.device_it));
+            // FIXME: should is_remote be set unconditionally?
             m_compiled_submodels[idx].is_remote.push_back(true);
         }
-
-        // After concat size might have changed
-        m_compiled_submodels[idx].update_required.resize(m_compiled_submodels[idx].closure.size(),
-                                                         m_cfg.get<::intel_npu::NPUW_FOLD>() ? true : false);
     });
 }
 
diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
index a9d1a324cd8a32..d565abcf4444c4 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
@@ -125,7 +125,6 @@ class CompiledModel : public ov::ICompiledModel {
         std::vector<weights::LazyTensor> transformations;
         std::vector<ov::Tensor> scales;
        std::vector<ov::Tensor> zerops;
-        std::vector<bool> update_required;
         std::vector<bool> is_remote;
 
         // FIXME: Take it out of structure
@@ -137,6 +136,8 @@ class CompiledModel : public ov::ICompiledModel {
     };
     std::vector<CompiledModelDesc> m_compiled_submodels;
 
+    bool m_update_required;
+
     std::function<bool(const ov::SoPtr<ov::ITensor>&, const ov::SoPtr<ov::ITensor>&)> m_acc_check;
     std::string m_ref_device;
 
diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp
index 30befaf6de0b9d..b609bb3e1e333a 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp
@@ -192,7 +192,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model,
-                } else if (comp_model_desc.update_required[cidx]) {
+                } else if (m_npuw_model->m_update_required) {
                     // At this point closure already contains allocated and transformed tensor ready to be used
                     request->set_tensor(iport, ov::get_tensor_impl(closure));
                 }
@@ -498,7 +498,7 @@ void ov::npuw::JustInferRequest::unpack_closure(std::size_t idx, RqPtr request)
         if (closure.get_element_type() != clparam->get_element_type()) {
             // Remember where the unpack is required
             closure_unpack_required.push_back(cidx);
-        } else if (comp_model_desc.update_required[cidx]) {
+        } else if (m_npuw_model->m_update_required) {
             if (needs_copy(idx, cidx)) {
                 // Remember where copy is requried
                 closure_copy_required.push_back(cidx);
diff --git a/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.cpp b/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.cpp
index ca87d43fdd85a9..9498cae7f75457 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.cpp
@@ -5,6 +5,8 @@
 #include "lazy_tensor.hpp"
 
 using ov::npuw::weights::ConcatMeta;
+using ov::npuw::weights::ConstPtr;
+using ov::npuw::weights::LTData;
 using ov::npuw::weights::LazyTensor;
 using ov::npuw::weights::Transform;
 using ov::npuw::weights::TransformType;
 
@@ -21,22 +23,33 @@ std::size_t LazyTensor::Hash::operator()(const LazyTensor& lt) const {
                 seed ^= std::hash<std::size_t>()(axis) + 0x9e3779b9;
             }
         } else if (tr.first == TransformType::CONCAT) {
-            // concat tag can be different, no need to hash it
             const auto& axis = std::get<ConcatMeta>(tr.second).second;
             seed ^= std::hash<std::size_t>()(axis) + 0x9e3779b9;
+            for (const auto& lt : std::get<ConcatMeta>(tr.second).first) {
+                seed ^= LazyTensor::Hash::operator()(lt) + 0x9e3779b9;
+            }
         }
     }
     return seed;
 }
 
 LazyTensor::LazyTensor(const TransformType& type, const Transform& transform) {
-    // Sanity check
-    NPUW_ASSERT(type == TransformType::TENSOR && std::holds_alternative<ov::Tensor>(transform));
-    m_transforms.push_back({type, transform});
-    const auto& tensor = std::get<ov::Tensor>(transform);
-    m_orig_data = tensor.data();
-    m_orig_shape = tensor.get_shape();
-    m_orig_type = tensor.get_element_type();
+    if (type == TransformType::TENSOR && std::holds_alternative<LTData>(transform)) {
+        m_transforms.push_back({type, transform});
+        ov::Tensor tensor;
+        if (std::holds_alternative<ConstPtr>(std::get<LTData>(transform))) {
+            tensor = ov::npuw::util::tensor_from_const(std::get<ConstPtr>(std::get<LTData>(transform)));
+        } else {
+            tensor = std::get<ov::Tensor>(std::get<LTData>(transform));
+        }
+        m_orig_data = tensor.data();
+        m_orig_shape = tensor.get_shape();
+        m_orig_type = tensor.get_element_type();
+    } else if (type == TransformType::CONCAT && std::holds_alternative<ConcatMeta>(transform)) {
+        m_transforms.push_back({type, transform});
+    } else {
+        NPUW_ASSERT(false);
+    }
 }
 
 bool LazyTensor::operator==(const LazyTensor& other) const {
@@ -56,11 +69,19 @@ bool LazyTensor::operator==(const LazyTensor& other) const {
                 return false;
             }
         } else if (m_transforms[i].first == TransformType::CONCAT) {
-            // concat tag can be different, no need to compare it
-            if (std::get<ConcatMeta>(m_transforms[i].second).second !=
-                std::get<ConcatMeta>(other.m_transforms[i].second).second) {
+            const auto& m1 = std::get<ConcatMeta>(m_transforms[i].second);
+            const auto& m2 = std::get<ConcatMeta>(other.m_transforms[i].second);
+            if (m1.second != m2.second) {
+                return false;
+            }
+            if (m1.first.size() != m2.first.size()) {
                 return false;
             }
+            for (std::size_t mi = 0; mi < m1.first.size(); ++mi) {
+                if (!(m1.first[mi] == m2.first[mi])) {
+                    return false;
+                }
+            }
         }
     }
 
@@ -70,8 +91,7 @@ bool LazyTensor::operator==(const LazyTensor& other) const {
 void LazyTensor::update(const TransformType& type, const Transform& transform) {
     // Sanity check
     NPUW_ASSERT((type == TransformType::PERMUTE && std::holds_alternative<std::vector<std::size_t>>(transform)) ||
-                (type == TransformType::CONVERT && std::holds_alternative<std::monostate>(transform)) ||
-                (type == TransformType::CONCAT && std::holds_alternative<ConcatMeta>(transform)));
+                (type == TransformType::CONVERT && std::holds_alternative<std::monostate>(transform)));
     m_transforms.push_back({type, transform});
 }
 
@@ -86,76 +106,58 @@ ov::Tensor LazyTensor::eval() const {
        Perhaps it should be done after model compilation and not handled here.
     */
-    // Sanity check
-    NPUW_ASSERT(std::holds_alternative<ov::Tensor>(m_transforms.front().second));
-
-    ov::Tensor transformed = get_orig_tensor();
+    ov::Tensor transformed;
     ov::Tensor tnew;
-    for (auto& tr : m_transforms) {
+
+    NPUW_ASSERT(!m_transforms.empty());
+
+    // Process the initial tensor - either from Const or from Concat
+    if (m_transforms.front().first == TransformType::TENSOR) {
+        transformed = get_orig_tensor();
+    } else if (m_transforms.front().first == TransformType::CONCAT) {
+        std::vector<ov::Tensor> to_concat;
+        for (const auto& lt : std::get<ConcatMeta>(m_transforms.front().second).first) {
+            // Sanity check
+            NPUW_ASSERT(!lt.has_transformations());
+            to_concat.push_back(lt.get_orig_tensor());
+        }
+        transformed = ov::npuw::util::concat(to_concat, std::get<ConcatMeta>(m_transforms.front().second).second);
+    } else {
+        NPUW_ASSERT(false);
+    }
+
+    // Process the transformations on top of the initial tensor
+    for (std::size_t i = 1; i < m_transforms.size(); ++i) {
+        const auto& tr = m_transforms[i];
         switch (tr.first) {
-        case TransformType::TENSOR:
-            continue;
         case TransformType::PERMUTE:
             tnew = ov::npuw::util::permute(transformed, std::get<std::vector<std::size_t>>(tr.second));
             tnew.copy_to(transformed);
+            break;
         case TransformType::CONVERT:
             tnew = ov::npuw::util::to_f16(transformed);
             tnew.copy_to(transformed);
-        case TransformType::CONCAT:
-            tnew = ov::npuw::util::concat(get_to_concat(), std::get<ConcatMeta>(tr.second).second);
-            tnew.copy_to(transformed);
+            break;
         default:
             NPUW_ASSERT(false);
         }
     }
-
     return transformed;
 }
 
-void* LazyTensor::get_orig_data() const {
-    return m_orig_data;
-}
-
 ov::Tensor LazyTensor::get_orig_tensor() const {
     // Sanity check
-    NPUW_ASSERT(std::holds_alternative<ov::Tensor>(m_transforms.front().second));
-    return std::get<ov::Tensor>(m_transforms.front().second);
-}
-
-bool LazyTensor::has_concat() const {
-    for (auto& tr : m_transforms) {
-        if (tr.first == TransformType::CONCAT) {
-            return true;
-        }
+    NPUW_ASSERT(!has_transformations());
+    if (std::holds_alternative<ConstPtr>(std::get<LTData>(m_transforms.front().second))) {
+        return ov::npuw::util::tensor_from_const(std::get<ConstPtr>(std::get<LTData>(m_transforms.front().second)));
     }
-    return false;
+    return std::get<ov::Tensor>(std::get<LTData>(m_transforms.front().second));
 }
 
 bool LazyTensor::has_transformations() const {
-    // The first transformation is always initial Tensor
-    return m_transforms.size() > 1;
-}
-
-std::vector<ov::Tensor> LazyTensor::get_to_concat() const {
-    NPUW_ASSERT(has_concat());
-    std::vector<ov::Tensor> to_concat;
-    for (auto& tr : m_transforms) {
-        if (tr.first == TransformType::CONCAT) {
-            for (const auto& lt : std::get<ConcatMeta>(tr.second).first) {
-                to_concat.push_back(lt.get_orig_tensor());
-            }
-        }
+    // The first transformation is always the initial Tensor or Concat
+    if (m_transforms.size() == 1 && m_transforms.front().first == TransformType::TENSOR) {
+        return false;
     }
-    return to_concat;
-}
-
-std::vector<LazyTensor> LazyTensor::get_lt_to_concat() const {
-    NPUW_ASSERT(has_concat());
-    for (auto& tr : m_transforms) {
-        if (tr.first == TransformType::CONCAT) {
-            return std::get<ConcatMeta>(tr.second).first;
-        }
-    }
-    NPUW_ASSERT(false);
-    return {};
+    return true;
 }
diff --git a/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.hpp b/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.hpp
index 2589b1cb1107ec..fd6ea17e8b1659 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.hpp
@@ -25,8 +25,11 @@ enum class TransformType : int { TENSOR, PERMUTE, CONVERT, CONCAT };
 class LazyTensor;
 
 using ConcatMeta = std::pair<std::vector<LazyTensor>, std::size_t>;
+using ConstPtr = std::shared_ptr<ov::op::v0::Constant>;
+using LTData = std::variant<ConstPtr, ov::Tensor>;
 
-using Transform = std::variant<ov::Tensor, std::vector<std::size_t>, std::monostate, ConcatMeta>;
+// LazyTensor owns Constant's memory
+using Transform = std::variant<LTData, std::vector<std::size_t>, std::monostate, ConcatMeta>;
 
 class LazyTensor {
 public:
@@ -43,17 +46,13 @@ class LazyTensor {
     void update(const TransformType& type, const Transform& transform);
 
     ov::Tensor eval() const;
-    void* get_orig_data() const;
     ov::Tensor get_orig_tensor() const;
-    bool has_concat() const;
     bool has_transformations() const;
-    std::vector<ov::Tensor> get_to_concat() const;
-    std::vector<LazyTensor> get_lt_to_concat() const;
 
 private:
     std::vector<std::pair<TransformType, Transform>> m_transforms;
-    void* m_orig_data;
+    void* m_orig_data = nullptr;
     ov::Shape m_orig_shape;
     ov::element::Type m_orig_type;
 };
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
index 912a0f8b77accb..811f7cb5666617 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
@@ -1461,8 +1461,7 @@ void Partitioner::createFunction(FunctionPipeline& func_ggg) {
                 LOG_DEBUG("Register " << prod_output << " in the function closure");
                 funcall._transformations.push_back(LazyTensor(
-                    TransformType::TENSOR,
-                    bank->update(std::dynamic_pointer_cast<ov::op::v0::Constant>(input_node))));  // (n)/1/i/c
+                    TransformType::TENSOR, std::dynamic_pointer_cast<ov::op::v0::Constant>(input_node)));  // (n)/1/i/c
             } else if (ov::op::util::is_parameter(input_node)) {
                 LOG_DEBUG("Handling a Parameter input " << prod_output);
                 LOG_BLOCK();
@@ -1559,8 +1558,7 @@ void Partitioner::matchRepeatedSubgraphs(const std::string& func_name) {
                 LOG_DEBUG("Register " << prod_output << " in the function closure[" << param_idx
                                       << "] (via prototype " << proto_layer_name << ")");
                 funcall._transformations[param_idx - function._param_offset] = LazyTensor(
-                    TransformType::TENSOR,
-                    bank->update(std::dynamic_pointer_cast<ov::op::v0::Constant>(input_node)));  // (t)/1/c
+                    TransformType::TENSOR, std::dynamic_pointer_cast<ov::op::v0::Constant>(input_node));  // (t)/1/c
             }
         }  // for (inputs)
     }      // for(nodes)
@@ -1650,18 +1648,15 @@ void Partitioner::optimize(const std::string& func_name) {
         auto& funcall = func_group.refs[f_idx].get();
         std::vector<LazyTensor> to_concat;
         // Fill tensor vector
-        for (auto&& cidx : to_concat_idx) {
-            to_concat.push_back(funcall._transformations[cidx]);
-        }
-        // Set to lazy tensor history
         for (auto&& cidx : to_concat_idx) {
             // FIXME: Assuming here concat goes first and other transformations later.
             // This allows to store ov::Tensor and ignore their potential history of transformations
-            funcall._transformations[cidx].update(TransformType::CONCAT, std::make_pair(to_concat, axis));
+            NPUW_ASSERT(!funcall._transformations[cidx].has_transformations());
+            to_concat.push_back(funcall._transformations[cidx]);
         }
-        // Pick the first (could be any) LazyTensor and set as new future-concatenated tensor
+        // Note: funcall._transformations[cidx] needs no update here - those LazyTensors will be gone, replaced by the new concatenated one added below
         if (!to_concat.empty()) {
-            funcall._transformations.push_back(to_concat.front());
+            funcall._transformations.push_back(LazyTensor(TransformType::CONCAT, std::make_pair(to_concat, axis)));
         }
     });
 }
diff --git a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp
index 8fb7bbafd87ef8..0beb8cf9e40aad 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp
@@ -26,7 +26,7 @@ class BankManager {
     // Public API
     std::shared_ptr<Bank> getBank(const std::string& bank_name,
                                   const std::shared_ptr<const ov::ICore>& core,
-                                  bool alloc_allowed);
+                                  const std::string& alloc_device);
 
 private:
     // Data
@@ -34,18 +34,6 @@ class BankManager {
     std::mutex m_mutex;
 };
 
-ov::Tensor Bank::update(const std::shared_ptr<ov::op::v0::Constant>& node) {
-    std::lock_guard<std::mutex> guard(m_mutex);
-
-    auto tensor = ov::npuw::util::tensor_from_const(node);
-
-    if (m_bank.find(tensor.data()) == m_bank.end()) {
-        m_bank[tensor.data()] = node;
-    }
-
-    return tensor;
-}
-
 ov::Tensor Bank::get(const LazyTensor& tensor, const std::string& device) {
     if (device != "CPU" && device != "NPU") {
         OPENVINO_THROW("Unsupported device in weights bank allocation: ", device);
@@ -53,83 +41,39 @@ ov::Tensor Bank::get(const LazyTensor& tensor, const std::string& device) {
 
     std::lock_guard<std::mutex> guard(m_mutex);
 
-    // Sanity check
-    auto iter_cpu = m_bank.find(tensor.get_orig_data());
-    if (iter_cpu == m_bank.end()) {
-        OPENVINO_THROW("Unknown tensor in weights bank allocation!");
-    }
-
     // Check if already allocated and transformed
     auto& device_bank = m_device_bank[device];
     auto iter_device = device_bank.find(tensor);
     if (iter_device == device_bank.end()) {
-        OPENVINO_THROW("There is no allocated/transformed tensor found!");
+        ov::Tensor transformed_tensor = tensor.eval();
+
+        if (device == "CPU" || m_alloc_device != device) {
+            // No allocation - store as is
+            device_bank[tensor] = transformed_tensor;
+            return transformed_tensor;
+        }
+
+        // Allocation needed
+        m_remote_ctx = m_core->get_default_context(device)._ptr;
+        auto remote_tensor =
+            m_remote_ctx->create_host_tensor(transformed_tensor.get_element_type(), transformed_tensor.get_shape());
+        auto allocated_tensor = ov::make_tensor(remote_tensor);
+        transformed_tensor.copy_to(allocated_tensor);
+        device_bank[tensor] = allocated_tensor;
+        return allocated_tensor;
     }
 
     return iter_device->second;
 }
 
-void Bank::store(const LazyTensor& tensor, const ov::Tensor& transformed_tensor, const std::string& device) {
-    if (device != "CPU" && device != "NPU") {
-        OPENVINO_THROW("Unsupported device in weights bank allocation: ", device);
-    }
-
-    std::lock_guard<std::mutex> guard(m_mutex);
-
-    // Sanity check
-    auto iter_cpu = m_bank.find(tensor.get_orig_data());
-    if (iter_cpu == m_bank.end()) {
-        OPENVINO_THROW("Unknown tensor in weights bank allocation!");
-    }
-
-    // Check if already allocated and transformed
-    auto& device_bank = m_device_bank[device];
-    auto iter_device = device_bank.find(tensor);
-    if (iter_device != device_bank.end()) {
-        LOG_WARN("Tensor is already allocated and stored in the bank.");
-        return;
-    }
-
-    if (device == "CPU" || !m_alloc_allowed) {
-        // No allocation needed - store as is
-        device_bank[tensor] = transformed_tensor;
-        return;
-    }
-
-    // Allocation needed
-    m_remote_ctx = m_core->get_default_context(device)._ptr;
-    auto remote_tensor =
-        m_remote_ctx->create_host_tensor(transformed_tensor.get_element_type(), transformed_tensor.get_shape());
-    auto allocated_tensor = ov::make_tensor(remote_tensor);
-    transformed_tensor.copy_to(allocated_tensor);
-    device_bank[tensor] = allocated_tensor;
-}
-
-bool Bank::has(const LazyTensor& tensor, const std::string& device) {
-    if (device != "CPU" && device != "NPU") {
-        OPENVINO_THROW("Unsupported device in weights bank allocation: ", device);
-    }
-
-    std::lock_guard<std::mutex> guard(m_mutex);
-
-    // Sanity check
-    auto iter_cpu = m_bank.find(tensor.get_orig_data());
-    if (iter_cpu == m_bank.end()) {
-        OPENVINO_THROW("Unknown tensor in weights bank allocation!");
-    }
-
-    const auto& device_bank = m_device_bank[device];
-    return device_bank.find(tensor) != device_bank.end();
-}
-
 std::shared_ptr<Bank> BankManager::getBank(const std::string& bank_name,
                                            const std::shared_ptr<const ov::ICore>& core,
-                                           bool alloc_allowed) {
+                                           const std::string& alloc_device) {
     std::lock_guard<std::mutex> guard(m_mutex);
 
     auto iter = m_bank_map.find(bank_name);
     if (iter == m_bank_map.end()) {
-        auto bank = std::make_shared<Bank>(core, alloc_allowed);
+        auto bank = std::make_shared<Bank>(core, alloc_device);
         m_bank_map[bank_name] = bank;
         return bank;
     }
@@ -138,12 +82,12 @@ std::shared_ptr<Bank> BankManager::getBank(const std::string& bank_name,
 
 std::shared_ptr<Bank> ov::npuw::weights::bank(const std::string& bank_name,
                                               const std::shared_ptr<const ov::ICore>& core,
-                                              bool alloc_allowed) {
+                                              const std::string& alloc_device) {
     if (bank_name.empty()) {
         // Don't share this bank in manager
-        return std::make_shared<Bank>(core, alloc_allowed);
+        return std::make_shared<Bank>(core, alloc_device);
     }
 
     auto& instance = BankManager::getInstance();
-    return instance.getBank(bank_name, core, alloc_allowed);
+    return instance.getBank(bank_name, core, alloc_device);
 }
diff --git a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.hpp b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.hpp
index e54998502fd86c..80c368a3095bd4 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.hpp
@@ -22,34 +22,25 @@ namespace weights {
 
 class Bank {
 public:
-    explicit Bank(const std::shared_ptr<const ov::ICore>& core, bool alloc_allowed)
+    explicit Bank(const std::shared_ptr<const ov::ICore>& core, const std::string& alloc_device)
         : m_core(core),
-          m_alloc_allowed(alloc_allowed) {}
-
-    // Capture CPU version of the tensor
-    ov::Tensor update(const std::shared_ptr<ov::op::v0::Constant>& node);
+          m_alloc_device(alloc_device) {}
 
     // Based on previously captured lazy tensor allocate a new tensor (if needed) on a specified device
     ov::Tensor get(const LazyTensor& tensor, const std::string& device);
 
-    // Store transformed and allocated tensor
-    void store(const LazyTensor& tensor, const ov::Tensor& transformed_tensor, const std::string& device);
-    // Check if there is an allocated and transformed tensor
-    bool has(const LazyTensor& tensor, const std::string& device);
 
 private:
-    // Default CPU bank. Filled by update(). Owns CPU memory
-    std::unordered_map<void*, std::shared_ptr<ov::op::v0::Constant>> m_bank;
     // Bank for specified device and their allocated memory
     std::unordered_map<std::string, std::unordered_map<LazyTensor, ov::Tensor, LazyTensor::Hash>> m_device_bank;
     std::mutex m_mutex;
 
     std::shared_ptr<const ov::ICore> m_core = nullptr;
     std::shared_ptr<ov::IRemoteContext> m_remote_ctx = nullptr;
-    bool m_alloc_allowed = false;
+    std::string m_alloc_device;
 };
 
 std::shared_ptr<Bank> bank(const std::string& bank_name,
                            const std::shared_ptr<const ov::ICore>& core,
-                           bool alloc_allowed);
+                           const std::string& alloc_device);
 
 }  // namespace weights
 }  // namespace npuw
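
Not part of the patch: a minimal usage sketch of the two properties this change reworks, for reviewers who want to try it. The property keys NPUW_WEIGHTS_BANK and NPUW_WEIGHTS_BANK_ALLOC come from the headers above; the model path, bank name, and the "NPU_USE_NPUW" enable key are illustrative assumptions.

    #include <openvino/openvino.hpp>

    int main() {
        ov::Core core;
        auto model = core.read_model("model.xml");  // placeholder path

        // NPUW_WEIGHTS_BANK names a bank that can be shared between compiled models.
        // NPUW_WEIGHTS_BANK_ALLOC now takes a device name (it used to be a bool) and
        // tells the bank where to allocate the evaluated weights.
        ov::AnyMap config = {
            {"NPU_USE_NPUW", "YES"},             // assumed NPUW enable flag
            {"NPUW_WEIGHTS_BANK", "shared"},     // illustrative bank name
            {"NPUW_WEIGHTS_BANK_ALLOC", "NPU"},
        };
        auto compiled = core.compile_model(model, "NPU", config);
        return 0;
    }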
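
Also not part of the patch: a rough sketch of how the reworked LazyTensor and Bank are meant to interact after this change. It is illustrative only; it assumes the ov::npuw::weights types introduced above, hypothetical 2x2 constants, and an ov::ICore instance obtained elsewhere.

    #include "lazy_tensor.hpp"
    #include "weights_bank.hpp"

    #include <openvino/op/constant.hpp>

    void weights_bank_sketch(const std::shared_ptr<const ov::ICore>& core) {
        using namespace ov::npuw::weights;

        auto c0 = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{2, 2},
                                                         std::vector<float>(4, 0.f));
        auto c1 = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{2, 2},
                                                         std::vector<float>(4, 1.f));

        // A weight is recorded as a LazyTensor holding the Constant itself (LTData),
        // so no copy is made at partitioning time and the Constant stays alive.
        LazyTensor w0(TransformType::TENSOR, LTData{c0});
        LazyTensor w1(TransformType::TENSOR, LTData{c1});

        // A planned concatenation is now a LazyTensor of its own (see Partitioner::optimize()).
        LazyTensor cat(TransformType::CONCAT, ConcatMeta{{w0, w1}, 0u});

        // Bank::get() evaluates a LazyTensor on first use and caches the result per device.
        // Since Hash/operator== now look into the concatenated LazyTensors, equivalent
        // CONCAT requests from different submodels share a single allocation.
        auto wbank = bank("shared", core, "NPU");
        ov::Tensor first = wbank->get(cat, "NPU");   // evaluated and allocated here
        ov::Tensor again = wbank->get(cat, "NPU");   // served from the device bank
    }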