Commit: Address review comments part 1

smirnov-alexey committed Oct 2, 2024
1 parent 6b02839 commit 1dc99f2
Showing 11 changed files with 112 additions and 201 deletions.
@@ -49,7 +49,7 @@ DEFINE_OPT(NPUW_DCOFF_SCALE, bool, false, npuw::partitioning::dcoff_with_scale,
DEFINE_OPT(NPUW_FUNCALL_FOR_ALL, bool, false, npuw::partitioning::funcall_for_all, CompileTime);
DEFINE_OPT(NPUW_PARALLEL_COMPILE, bool, false, npuw::parallel_compilation, CompileTime);
DEFINE_OPT(NPUW_WEIGHTS_BANK, std::string, "", npuw::weights_bank, CompileTime);
-DEFINE_OPT(NPUW_WEIGHTS_BANK_ALLOC, bool, false, npuw::weights_bank_alloc, CompileTime);
+DEFINE_OPT(NPUW_WEIGHTS_BANK_ALLOC, std::string, "", npuw::weights_bank_alloc, CompileTime);
DEFINE_OPT(NPUW_FUNCALL_ASYNC, bool, false, npuw::funcall_async, RunTime);
DEFINE_OPT(NPUW_ACC_CHECK, bool, false, npuw::accuracy::check, RunTime);
DEFINE_OPT(NPUW_ACC_THRESH, double, 0.01, npuw::accuracy::threshold, RunTime);
@@ -48,10 +48,10 @@ static constexpr ov::Property<std::string> weights_bank{"NPUW_WEIGHTS_BANK"};
/**
* @brief
* Type: std::string.
- * Specify if weights bank is allowed to allocate NPU memory.
+ * Specify device name for weights bank which is used to allocate memory.
 * Default value: "".
*/
-static constexpr ov::Property<bool> weights_bank_alloc{"NPUW_WEIGHTS_BANK_ALLOC"};
+static constexpr ov::Property<std::string> weights_bank_alloc{"NPUW_WEIGHTS_BANK_ALLOC"};

namespace partitioning {
namespace online {
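Note: downstream of this change, NPUW_WEIGHTS_BANK_ALLOC carries a device name rather than an on/off switch. A minimal usage sketch (assuming NPUW is enabled for the target device and that these private properties are accepted as plain strings in the compile config; the model path and bank tag are illustrative):

```cpp
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    const auto model = core.read_model("model.xml");  // illustrative path
    // Share one weights bank across compiled models and let it allocate
    // its memory on the NPU device.
    auto compiled = core.compile_model(model,
                                       "NPU",
                                       ov::AnyMap{{"NPUW_WEIGHTS_BANK", "shared"},
                                                  {"NPUW_WEIGHTS_BANK_ALLOC", "NPU"}});
    return 0;
}
```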
3 changes: 1 addition & 2 deletions src/plugins/intel_npu/src/backend/src/zero_remote_tensor.cpp
@@ -88,8 +88,7 @@ void ZeroRemoteTensor::allocate(const size_t bytes) {
ze_host_mem_alloc_flag_t flag = ZE_HOST_MEM_ALLOC_FLAG_BIAS_WRITE_COMBINED;
desc = {ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, nullptr, static_cast<ze_host_mem_alloc_flags_t>(flag)};
} else {
-        ze_host_mem_alloc_flag_t flag = ZE_HOST_MEM_ALLOC_FLAG_BIAS_CACHED;
-        desc = {ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, nullptr, static_cast<ze_host_mem_alloc_flags_t>(flag)};
+        desc = {ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, nullptr, 0};
}
zeroUtils::throwOnFail("zeMemAllocHost",
zeMemAllocHost(_init_structs->getContext(), &desc, size, STANDARD_PAGE_SIZE, &_data));
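Note: a zero flags field lets the Level Zero driver pick its default host allocation policy (cacheable memory), so dropping the explicit BIAS_CACHED hint should not change behavior. A condensed view of the resulting branch (paraphrased from the hunk above; the input/output predicate is outside this hunk and hypothetical here):

```cpp
ze_host_mem_alloc_desc_t desc = {};
if (is_input) {  // hypothetical predicate - the real condition is not shown in this diff
    // Host writes, device reads: bias the allocation to write-combined.
    ze_host_mem_alloc_flag_t flag = ZE_HOST_MEM_ALLOC_FLAG_BIAS_WRITE_COMBINED;
    desc = {ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, nullptr, static_cast<ze_host_mem_alloc_flags_t>(flag)};
} else {
    // 0 == no bias flags: the driver default (cacheable) host memory.
    desc = {ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, nullptr, 0};
}
```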
26 changes: 4 additions & 22 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -115,7 +115,7 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,

// Initialize weights bank
const std::string weights_bank_opt = m_cfg.get<::intel_npu::NPUW_WEIGHTS_BANK>();
-    bool wbank_alloc = m_cfg.get<::intel_npu::NPUW_WEIGHTS_BANK_ALLOC>();
+    const std::string wbank_alloc = m_cfg.get<::intel_npu::NPUW_WEIGHTS_BANK_ALLOC>();
m_weights_bank = ov::npuw::weights::bank(weights_bank_opt, plugin->get_core(), wbank_alloc);

LOG_VERB("*** Original model ***");
@@ -235,6 +235,8 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
} // for(ordered_subgraphs)
// NOTE(dm): there's a better way to do it, like we do in G-API backends.

+    m_update_required = m_cfg.get<::intel_npu::NPUW_FOLD>() ? true : false;

    // Store mapping between manually split inputs/outputs
// to connect tensors between compiled submodels
m_submodels_input_to_prev_output = partitioning.input_to_prev_output;
@@ -289,8 +291,6 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
m_compiled_submodels[id].transformations = subgraph._transformations;
m_compiled_submodels[id].scales = subgraph._scales;
m_compiled_submodels[id].zerops = subgraph._zerops;
-            m_compiled_submodels[id].update_required.resize(subgraph._closure.size(),
-                                                            m_cfg.get<::intel_npu::NPUW_FOLD>() ? true : false);
m_compiled_submodels[id].is_remote.resize(subgraph._closure.size(), false);
} // if(!funcall)

@@ -428,28 +428,10 @@ void ov::npuw::CompiledModel::finalize_weights_bank() {

for (std::size_t tidx = 0; tidx < comp_model_desc.transformations.size(); ++tidx) {
const auto& lt = m_compiled_submodels[idx].transformations[tidx];

-            // FIXME: probably should be more careful with the devices here
-            if (!m_weights_bank->has(lt, *func_desc.device_it)) {
-                ov::Tensor evaled = lt.eval();
-                if (lt.has_concat()) {
-                    // FIXME: probably setting just front() LazyTensor here is enough
-                    for (const auto& lt_to_concat : lt.get_lt_to_concat()) {
-                        // Note: this also includes this LazyTensor's original ov::Tensor
-                        m_weights_bank->store(lt_to_concat, evaled, *func_desc.device_it);
-                    }
-                } else {
-                    m_weights_bank->store(lt, evaled, *func_desc.device_it);
-                }
-            }

m_compiled_submodels[idx].closure.push_back(m_weights_bank->get(lt, *func_desc.device_it));
// FIXME: should is_remote be set unconditionally?
m_compiled_submodels[idx].is_remote.push_back(true);
}

-        // After concat size might have changed
-        m_compiled_submodels[idx].update_required.resize(m_compiled_submodels[idx].closure.size(),
-                                                         m_cfg.get<::intel_npu::NPUW_FOLD>() ? true : false);
});
}

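Note: the evaluate-and-store logic removed above now lives behind the bank's get(). The bank implementation is not part of this diff; a hypothetical sketch of the contract the caller relies on (names and locking are assumptions) might look like:

```cpp
// Hypothetical sketch - not the actual weights bank code (not shown in this diff).
// get() returns the evaluated tensor, materializing and caching it per device
// on first use, so callers no longer pre-evaluate LazyTensors themselves.
ov::Tensor Bank::get(const LazyTensor& lt, const std::string& device) {
    std::lock_guard<std::mutex> guard(m_mutex);
    auto& device_bank = m_device_banks[device];
    auto iter = device_bank.find(lt);
    if (iter == device_bank.end()) {
        // First request: evaluate the lazy transformation chain and cache it
        iter = device_bank.emplace(lt, lt.eval()).first;
    }
    return iter->second;
}
```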
3 changes: 2 additions & 1 deletion src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
@@ -125,7 +125,6 @@ class CompiledModel : public ov::ICompiledModel {
std::vector<weights::LazyTensor> transformations;
std::vector<ov::Tensor> scales;
std::vector<ov::Tensor> zerops;
-        std::vector<bool> update_required;
std::vector<bool> is_remote;

// FIXME: Take it out of structure
@@ -137,6 +136,8 @@
};
std::vector<CompiledModelDesc> m_compiled_submodels;

+    bool m_update_required;

std::function<bool(const ov::SoPtr<ov::ITensor>&, const ov::SoPtr<ov::ITensor>&)> m_acc_check;
std::string m_ref_device;

@@ -192,7 +192,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com

// No update required to this tensor in runtime - so it can be set only once
// Will be utilized when there is no FOLDing
-            if (!comp_model_desc.update_required[cidx]) {
+            if (!m_npuw_model->m_update_required) {
// At this point closure already contains allocated and transformed tensor ready to be used
request->set_tensor(iport, ov::get_tensor_impl(closure));
}
@@ -498,7 +498,7 @@ void ov::npuw::JustInferRequest::unpack_closure(std::size_t idx, RqPtr request)
if (closure.get_element_type() != clparam->get_element_type()) {
// Remember where the unpack is required
closure_unpack_required.push_back(cidx);
-        } else if (comp_model_desc.update_required[cidx]) {
+        } else if (m_npuw_model->m_update_required) {
if (needs_copy(idx, cidx)) {
                // Remember where copy is required
closure_copy_required.push_back(cidx);
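Note: the replaced per-tensor update_required vector always held one uniform value derived from NPUW_FOLD, which is why a single model-wide flag suffices; schematically (names taken from this diff):

```cpp
// The flag is computed once per compiled model:
const bool update_required = m_cfg.get<::intel_npu::NPUW_FOLD>();
// update_required == false: a closure tensor never changes at runtime, so it
// can be bound to the underlying infer request once, at construction time.
// update_required == true: closure tensors are rewritten by unpack_closure()
// on every run instead.
```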
126 changes: 63 additions & 63 deletions src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.cpp
@@ -5,6 +5,8 @@
#include "lazy_tensor.hpp"

using ov::npuw::weights::ConcatMeta;
+using ov::npuw::weights::ConstPtr;
+using ov::npuw::weights::LTData;
using ov::npuw::weights::LazyTensor;
using ov::npuw::weights::Transform;
using ov::npuw::weights::TransformType;
@@ -21,22 +23,33 @@ std::size_t LazyTensor::Hash::operator()(const LazyTensor& lt) const {
seed ^= std::hash<std::size_t>()(axis) + 0x9e3779b9;
}
} else if (tr.first == TransformType::CONCAT) {
-            // concat tag can be different, no need to hash it
             const auto& axis = std::get<ConcatMeta>(tr.second).second;
             seed ^= std::hash<std::size_t>()(axis) + 0x9e3779b9;
+            for (const auto& lt : std::get<ConcatMeta>(tr.second).first) {
+                seed ^= LazyTensor::Hash::operator()(lt) + 0x9e3779b9;
+            }
}
}
return seed;
}

LazyTensor::LazyTensor(const TransformType& type, const Transform& transform) {
-    // Sanity check
-    NPUW_ASSERT(type == TransformType::TENSOR && std::holds_alternative<ov::Tensor>(transform));
-    m_transforms.push_back({type, transform});
-    const auto& tensor = std::get<ov::Tensor>(transform);
-    m_orig_data = tensor.data();
-    m_orig_shape = tensor.get_shape();
-    m_orig_type = tensor.get_element_type();
+    if (type == TransformType::TENSOR && std::holds_alternative<LTData>(transform)) {
+        m_transforms.push_back({type, transform});
+        ov::Tensor tensor;
+        if (std::holds_alternative<ConstPtr>(std::get<LTData>(transform))) {
+            tensor = ov::npuw::util::tensor_from_const(std::get<ConstPtr>(std::get<LTData>(transform)));
+        } else {
+            tensor = std::get<ov::Tensor>(std::get<LTData>(transform));
+        }
+        m_orig_data = tensor.data();
+        m_orig_shape = tensor.get_shape();
+        m_orig_type = tensor.get_element_type();
+    } else if (type == TransformType::CONCAT && std::holds_alternative<ConcatMeta>(transform)) {
+        m_transforms.push_back({type, transform});
+    } else {
+        NPUW_ASSERT(false);
+    }
}

bool LazyTensor::operator==(const LazyTensor& other) const {
@@ -56,11 +69,19 @@ bool LazyTensor::operator==(const LazyTensor& other) const {
return false;
}
} else if (m_transforms[i].first == TransformType::CONCAT) {
-            // concat tag can be different, no need to compare it
-            if (std::get<ConcatMeta>(m_transforms[i].second).second !=
-                std::get<ConcatMeta>(other.m_transforms[i].second).second) {
+            const auto& m1 = std::get<ConcatMeta>(m_transforms[i].second);
+            const auto& m2 = std::get<ConcatMeta>(other.m_transforms[i].second);
+            if (m1.second != m2.second) {
                 return false;
             }
+            if (m1.first.size() != m2.first.size()) {
+                return false;
+            }
+            for (std::size_t mi = 0; mi < m1.first.size(); ++mi) {
+                if (!(m1.first[mi] == m2.first[mi])) {
+                    return false;
+                }
+            }
}
}

@@ -70,8 +91,7 @@ bool LazyTensor::operator==(const LazyTensor& other) const {
void LazyTensor::update(const TransformType& type, const Transform& transform) {
// Sanity check
NPUW_ASSERT((type == TransformType::PERMUTE && std::holds_alternative<std::vector<std::size_t>>(transform)) ||
-                (type == TransformType::CONVERT && std::holds_alternative<std::monostate>(transform)) ||
-                (type == TransformType::CONCAT && std::holds_alternative<ConcatMeta>(transform)));
+                (type == TransformType::CONVERT && std::holds_alternative<std::monostate>(transform)));
m_transforms.push_back({type, transform});
}

@@ -86,76 +106,56 @@ ov::Tensor LazyTensor::eval() const {
Perhaps it should be done after model compilation and not handled here.
*/

-    // Sanity check
-    NPUW_ASSERT(std::holds_alternative<ov::Tensor>(m_transforms.front().second));
-
-    ov::Tensor transformed = get_orig_tensor();
+    ov::Tensor transformed;
     ov::Tensor tnew;
-    for (auto& tr : m_transforms) {
+
+    NPUW_ASSERT(!m_transforms.empty());

+    // Process the initial tensor - either from Const or from Concat
+    if (m_transforms.front().first == TransformType::TENSOR) {
+        transformed = get_orig_tensor();
+    } else if (m_transforms.front().first == TransformType::CONCAT) {
+        std::vector<ov::Tensor> to_concat;
+        for (const auto& lt : std::get<ConcatMeta>(m_transforms.front().second).first) {
+            // Sanity check
+            NPUW_ASSERT(!lt.has_transformations());
+            to_concat.push_back(lt.get_orig_tensor());
+        }
+        transformed = ov::npuw::util::concat(to_concat, std::get<ConcatMeta>(m_transforms.front().second).second);
+    } else {
+        NPUW_ASSERT(false);
+    }
+
+    // Process transformation on top of initial tensor
+    for (std::size_t i = 1; i < m_transforms.size(); ++i) {
+        const auto& tr = m_transforms[i];
switch (tr.first) {
-        case TransformType::TENSOR:
-            continue;
         case TransformType::PERMUTE:
             tnew = ov::npuw::util::permute(transformed, std::get<std::vector<std::size_t>>(tr.second));
             tnew.copy_to(transformed);
             break;
         case TransformType::CONVERT:
             tnew = ov::npuw::util::to_f16(transformed);
             tnew.copy_to(transformed);
             break;
-        case TransformType::CONCAT:
-            tnew = ov::npuw::util::concat(get_to_concat(), std::get<ConcatMeta>(tr.second).second);
-            tnew.copy_to(transformed);
         default:
             NPUW_ASSERT(false);
}
}

return transformed;
}

-void* LazyTensor::get_orig_data() const {
-    return m_orig_data;
-}
-
 ov::Tensor LazyTensor::get_orig_tensor() const {
     // Sanity check
-    NPUW_ASSERT(std::holds_alternative<ov::Tensor>(m_transforms.front().second));
-    return std::get<ov::Tensor>(m_transforms.front().second);
+    NPUW_ASSERT(!has_transformations());
+    if (std::holds_alternative<ConstPtr>(std::get<LTData>(m_transforms.front().second))) {
+        return ov::npuw::util::tensor_from_const(std::get<ConstPtr>(std::get<LTData>(m_transforms.front().second)));
+    }
+    return std::get<ov::Tensor>(std::get<LTData>(m_transforms.front().second));
 }
 
-bool LazyTensor::has_concat() const {
-    for (auto& tr : m_transforms) {
-        if (tr.first == TransformType::CONCAT) {
-            return true;
-        }
-    }
-    return false;
-}
-
 bool LazyTensor::has_transformations() const {
-    // The first transformation is always initial Tensor
-    return m_transforms.size() > 1;
-}
-
-std::vector<ov::Tensor> LazyTensor::get_to_concat() const {
-    NPUW_ASSERT(has_concat());
-    std::vector<ov::Tensor> to_concat;
-    for (auto& tr : m_transforms) {
-        if (tr.first == TransformType::CONCAT) {
-            for (const auto& lt : std::get<ConcatMeta>(tr.second).first) {
-                to_concat.push_back(lt.get_orig_tensor());
-            }
-        }
-    }
-    return to_concat;
-}
-
-std::vector<LazyTensor> LazyTensor::get_lt_to_concat() const {
-    NPUW_ASSERT(has_concat());
-    for (auto& tr : m_transforms) {
-        if (tr.first == TransformType::CONCAT) {
-            return std::get<ConcatMeta>(tr.second).first;
-        }
-    }
-    NPUW_ASSERT(false);
-    return {};
+    // The first transformation is always initial Tensor or Concat
+    if (m_transforms.size() == 1 && m_transforms.front().first == TransformType::TENSOR) {
+        return false;
+    }
+    return true;
 }
11 changes: 5 additions & 6 deletions src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.hpp
@@ -25,8 +25,11 @@ enum class TransformType : int { TENSOR, PERMUTE, CONVERT, CONCAT };
class LazyTensor;

using ConcatMeta = std::pair<std::vector<LazyTensor>, std::size_t>;
+using ConstPtr = std::shared_ptr<ov::op::v0::Constant>;
+using LTData = std::variant<ConstPtr, ov::Tensor>;

-using Transform = std::variant<ov::Tensor, std::vector<std::size_t>, std::monostate, ConcatMeta>;
+// LazyTensor owns Constant's memory
+using Transform = std::variant<LTData, std::vector<std::size_t>, std::monostate, ConcatMeta>;

class LazyTensor {
public:
@@ -43,17 +46,13 @@ class LazyTensor {
void update(const TransformType& type, const Transform& transform);
ov::Tensor eval() const;

-    void* get_orig_data() const;
     ov::Tensor get_orig_tensor() const;
 
-    bool has_concat() const;
     bool has_transformations() const;
-    std::vector<ov::Tensor> get_to_concat() const;
-    std::vector<LazyTensor> get_lt_to_concat() const;

private:
std::vector<std::pair<TransformType, Transform>> m_transforms;
-    void* m_orig_data;
+    void* m_orig_data = nullptr;
ov::Shape m_orig_shape;
ov::element::Type m_orig_type;
};
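Note: for orientation, the four TransformType values pair with the Transform alternatives as follows (illustrative values):

```cpp
// TENSOR: the initial data, either a Constant or an already-materialized tensor.
Transform t_root = LTData{ov::op::v0::Constant::create(ov::element::f32, ov::Shape{2, 2}, {0.f, 0.f, 0.f, 0.f})};
// PERMUTE: the axis order to apply.
Transform t_perm = std::vector<std::size_t>{1, 0};
// CONVERT: no payload; the target type (f16) is implied.
Transform t_conv = std::monostate{};
// CONCAT: the LazyTensors to fuse plus the concatenation axis.
Transform t_cat = ConcatMeta{std::vector<LazyTensor>{}, std::size_t{0}};
```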
@@ -1461,8 +1461,7 @@ void Partitioner::createFunction(FunctionPipeline& func_ggg) {

LOG_DEBUG("Register " << prod_output << " in the function closure");
funcall._transformations.push_back(LazyTensor(
-                TransformType::TENSOR,
-                bank->update(std::dynamic_pointer_cast<ov::op::v0::Constant>(input_node)))); // (n)/1/i/c
+                TransformType::TENSOR, std::dynamic_pointer_cast<ov::op::v0::Constant>(input_node))); // (n)/1/i/c
} else if (ov::op::util::is_parameter(input_node)) {
LOG_DEBUG("Handling a Parameter input " << prod_output);
LOG_BLOCK();
@@ -1559,8 +1558,7 @@ void Partitioner::matchRepeatedSubgraphs(const std::string& func_name) {
LOG_DEBUG("Register " << prod_output << " in the function closure[" << param_idx
<< "] (via prototype " << proto_layer_name << ")");
funcall._transformations[param_idx - function._param_offset] = LazyTensor(
-                    TransformType::TENSOR,
-                    bank->update(std::dynamic_pointer_cast<ov::op::v0::Constant>(input_node))); // (t)/1/c
+                    TransformType::TENSOR, std::dynamic_pointer_cast<ov::op::v0::Constant>(input_node)); // (t)/1/c
}
} // for (inputs)
} // for(nodes)
@@ -1650,18 +1648,15 @@ void Partitioner::optimize(const std::string& func_name) {
auto& funcall = func_group.refs[f_idx].get();
std::vector<LazyTensor> to_concat;
// Fill tensor vector
-            for (auto&& cidx : to_concat_idx) {
-                to_concat.push_back(funcall._transformations[cidx]);
-            }
-            // Set to lazy tensor history
             for (auto&& cidx : to_concat_idx) {
-                // FIXME: Assuming here concat goes first and other transformations later.
-                // This allows to store ov::Tensor and ignore their potential history of transformations
-                funcall._transformations[cidx].update(TransformType::CONCAT, std::make_pair(to_concat, axis));
+                NPUW_ASSERT(!funcall._transformations[cidx].has_transformations());
+                to_concat.push_back(funcall._transformations[cidx]);
             }
-            // Pick the first (could be any) LazyTensor and set as new future-concatenated tensor
+            // Note: we can ignore updating funcall._transformations[cidx] here since those LazyTensors will be gone and the new one added into the vector
             if (!to_concat.empty()) {
-                funcall._transformations.push_back(to_concat.front());
+                funcall._transformations.push_back(LazyTensor(TransformType::CONCAT, std::make_pair(to_concat, axis)));
}
});
}
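Note: the observable effect of the optimize() change, written as a post-condition sketch (hypothetical check; names from the diff):

```cpp
// Members picked for concatenation stay plain (transformation-free), and one
// new CONCAT-rooted LazyTensor is appended to represent the fused weight.
for (auto&& cidx : to_concat_idx) {
    NPUW_ASSERT(!funcall._transformations[cidx].has_transformations());
}
const auto& fused = funcall._transformations.back();
// A lone CONCAT root reports true here - see has_transformations() in
// lazy_tensor.cpp above.
NPUW_ASSERT(fused.has_transformations());
```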