diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.hpp
index 72a62781580cda..e7e5121b1240e7 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.hpp
@@ -16,8 +16,6 @@ namespace ov {
 namespace npuw {
 namespace online {
 
-class Group;  // forward declaration
-
 namespace detail {
 // At partitioning level we exclude some "non-Ops" to not interfere with the passes.
 // We include some of them back to properly link everything at plugin level
@@ -33,6 +31,8 @@ class Snapshot : public std::enable_shared_from_this<Snapshot> {
           m_node_to_prod_cons(std::make_shared<detail::OVNodeMap>()),
           m_node_to_gr(std::make_shared<detail::OVNodeToGroupMap>()) {}
 
+    friend class Group;  // also serves as a forward declaration of Group
+
     // Simple passes
     void singleGroup();
 
@@ -49,27 +49,27 @@ class Snapshot : public std::enable_shared_from_this<Snapshot> {
     void repeatedBlocks();
     void earlyAvoids();
     void earlyRegroup();
-    void markInternalCompute();
-    void resetExcludedRep();
 
     // Utility
     std::shared_ptr<own::ade::Graph> getGraph() const;
-    size_t graphSize() const;
-    const detail::OVNodeSet& getNodeProducers(const detail::OVNodePtr& node) const;
-    const detail::OVNodeSet& getNodeConsumers(const detail::OVNodePtr& node) const;
     const detail::OVPortsMap& getPortsMap() const;
     const detail::OVNodeToGroupMapPtr& getNodeToGroupMap() const;
     const std::map<std::string, std::vector<std::set<std::string>>>& getMatches() const;
-    detail::GPtrSet getRepGroups(const std::shared_ptr<Group>& group) const;
     void repeat(detail::Pass&& pass);
     void setCtx(const PassContext& ctx);
+    size_t graphSize() const;
 
 private:
+    detail::GPtrSet getRepGroups(const std::shared_ptr<Group>& group) const;
+    const detail::OVNodeSet& getNodeProducers(const detail::OVNodePtr& node) const;
+    const detail::OVNodeSet& getNodeConsumers(const detail::OVNodePtr& node) const;
     void identifyUniques();
     void mergeUniques();
     void mergeTriangles();
     void cleanUpUniques();
     void afterUniques();
+    void markInternalCompute();
+    void resetExcludedRep();
     bool cleanUpUniquesImpl(const detail::GPtrSet& gset);
     std::shared_ptr<Repeated> tryGrowRepeatingGroups(const detail::GPtrSet& repeating_groups);
     std::shared_ptr<Repeated> tryMergeTriangles(const detail::GPtrSet& repeating_groups);
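Note on the header change above: the merge/query internals move to the private section, and Group, their only intended caller, keeps reaching them through friendship; since a friend declaration also introduces the class name, the standalone forward declaration could be dropped. A minimal compilable sketch of the pattern (illustrative names, not the actual npuw types):

    #include <memory>
    #include <set>

    class Snapshot {
    public:
        void repeat() {}  // the public pass-pipeline surface stays public
    private:
        friend class Group;  // also declares the name Group
        std::set<int> getNodeProducers() const { return {1, 2, 3}; }  // internal query
    };

    class Group {
    public:
        static std::set<int> probe(const Snapshot& s) {
            return s.getNodeProducers();  // OK: Group is a friend of Snapshot
        }
    };

    int main() {
        Snapshot s;
        return static_cast<int>(Group::probe(s).size());  // 3
    }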
diff --git a/src/plugins/intel_npu/tests/CMakeLists.txt b/src/plugins/intel_npu/tests/CMakeLists.txt
index 4c41f008eb7f81..0f5bd7a6b093b2 100644
--- a/src/plugins/intel_npu/tests/CMakeLists.txt
+++ b/src/plugins/intel_npu/tests/CMakeLists.txt
@@ -8,3 +8,4 @@ if (MSVC)
     ov_add_compiler_flags(/wd5105)
 endif()
 add_subdirectory(functional)
+add_subdirectory(unit)
diff --git a/src/plugins/intel_npu/tests/unit/CMakeLists.txt b/src/plugins/intel_npu/tests/unit/CMakeLists.txt
new file mode 100644
index 00000000000000..861a0ff6a47076
--- /dev/null
+++ b/src/plugins/intel_npu/tests/unit/CMakeLists.txt
@@ -0,0 +1,46 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+set(TARGET_NAME "ov_npu_unit_tests")
+
+set(MANDATORY_UNIT_TESTS_LIBS
+        "openvino::commonTestUtils"
+        "openvino::gmock"
+        "openvino::gtest"
+        "openvino::gtest_main"
+        "openvino::runtime"
+        "openvino::npu_al"
+        "openvino::npu_logger_utils"
+)
+
+ov_add_test_target(
+        NAME ${TARGET_NAME}
+        ROOT ${CMAKE_CURRENT_SOURCE_DIR}
+        ADDITIONAL_SOURCE_DIRS
+            ${OpenVINO_SOURCE_DIR}/src/plugins/intel_npu/src/plugin/npuw/
+        DEPENDENCIES
+            openvino::runtime
+        INCLUDES
+            ${CMAKE_CURRENT_SOURCE_DIR}
+            ${CMAKE_CURRENT_SOURCE_DIR}/npuw
+            ${OpenVINO_SOURCE_DIR}/src/plugins/intel_npu/src/plugin/npuw
+            ${OpenVINO_SOURCE_DIR}/src/plugins/intel_npu/src/utils/include
+            ${OpenVINO_SOURCE_DIR}/src/plugins/intel_npu/src/plugin/include
+            ${OpenVINO_SOURCE_DIR}/src/plugins/intel_npu/src/al/include
+        LINK_LIBRARIES
+            ${MANDATORY_UNIT_TESTS_LIBS}
+        LABELS
+            NPUW
+)
+
+if(ENABLE_AVX2)
+    ov_avx2_optimization_flags(avx2_flags)
+    target_compile_options(${TARGET_NAME} PRIVATE "${avx2_flags}")
+endif()
+
+install(TARGETS ${TARGET_NAME}
+        RUNTIME DESTINATION tests
+        COMPONENT tests
+        EXCLUDE_FROM_ALL
+)
diff --git a/src/plugins/intel_npu/tests/unit/npuw/online_partitioning.cpp b/src/plugins/intel_npu/tests/unit/npuw/online_partitioning.cpp
new file mode 100644
index 00000000000000..af1fc5de8e92c7
--- /dev/null
+++ b/src/plugins/intel_npu/tests/unit/npuw/online_partitioning.cpp
@@ -0,0 +1,692 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <gtest/gtest.h>
+#include <gmock/gmock.h>
+
+#include "partitioning/online/compiler.hpp"
+#include "partitioning/online/snapshot.hpp"
+#include "partitioning/online/group.hpp"
+
+#include "intel_npu/al/config/config.hpp"
+#include "intel_npu/al/config/npuw.hpp"
+
+#include "openvino/openvino.hpp"
+#include "openvino/op/ops.hpp"
+#include "openvino/op/util/op_types.hpp"
+
+bool isEqualEns(ov::npuw::Ensemble& ens1, ov::npuw::Ensemble& ens2);
+bool isEqualEns(ov::npuw::Ensemble& ens1, ov::npuw::Ensemble& ens2) {
+    if (ens1.groups.size() != ens2.groups.size()) {
+        return false;
+    }
+
+    for (auto& g : ens1.groups) {
+        std::sort(g.input_layers.begin(), g.input_layers.end());
+        std::sort(g.output_layers.begin(), g.output_layers.end());
+        std::sort(g.all_layers.begin(), g.all_layers.end());
+    }
+
+    for (auto& g : ens2.groups) {
+        std::sort(g.input_layers.begin(), g.input_layers.end());
+        std::sort(g.output_layers.begin(), g.output_layers.end());
+        std::sort(g.all_layers.begin(), g.all_layers.end());
+    }
+
+    std::sort(ens1.groups.begin(), ens1.groups.end(), [](const ov::npuw::Group& g1,
+                                                         const ov::npuw::Group& g2) {
+        return g1.all_layers.front() < g2.all_layers.front();
+    });
+
+    std::sort(ens2.groups.begin(), ens2.groups.end(), [](const ov::npuw::Group& g1,
+                                                         const ov::npuw::Group& g2) {
+        return g1.all_layers.front() < g2.all_layers.front();
+    });
+
+    for (size_t i = 0; i < ens1.groups.size(); ++i) {
+        const auto& g1 = ens1.groups.at(i);
+        const auto& g2 = ens2.groups.at(i);
+
+        if (g1.avoid_list != g2.avoid_list ||
+            g1.input_layers != g2.input_layers ||
+            g1.output_layers != g2.output_layers ||
+            g1.all_layers != g2.all_layers) {
+            return false;
+        }
+
+        // Can't compare repeated_ids directly since they are random, but they don't affect the structure
+        if ((g1.repeated_id.empty() && !g2.repeated_id.empty()) ||
+            (!g1.repeated_id.empty() && g2.repeated_id.empty())) {
+            return false;
+        }
+    }
+
+    if (ens1.repeated.size() != ens2.repeated.size()) {
+        return false;
+    }
+
+    auto get_sorted_rep = [](const std::map<std::string, ov::npuw::RepeatedBlock>& rep) {
+        std::vector<std::vector<std::set<std::string>>> sorted_rep;
+
+        std::transform(rep.begin(), rep.end(), std::back_inserter(sorted_rep), [](const auto& v) {
+            return v.second.matches;
+        });
+
+        for (auto& g : sorted_rep) {
+            std::sort(g.begin(), g.end(),
+                      [](const auto& a, const auto& b) { return *a.begin() < *b.begin(); });
+        }
+
+        std::sort(sorted_rep.begin(), sorted_rep.end(),
+                  [](const auto& a, const auto& b) { return *a.front().begin() < *b.front().begin(); });
+
+        return sorted_rep;
+    };
+
+    if (get_sorted_rep(ens1.repeated) != get_sorted_rep(ens2.repeated)) {
+        return false;
+    }
+
+    return true;
+}
+
+class ModelGenerator {
+public:
+    ModelGenerator() = default;
+
+    std::shared_ptr<ov::Model> 
get_model_without_repeated_blocks() { + std::shared_ptr input = std::make_shared(ov::element::i32, ov::Shape{1, 1, 40}); + m_nodes.push_back(input); + set_name(input); + + std::shared_ptr res = get_block(input); + + auto result = std::make_shared(res); + m_nodes.push_back(result); + set_name(result); + + ov::ParameterVector params = {input}; + ov::ResultVector results = {result}; + + return std::make_shared(results, params); + } + + std::shared_ptr get_model_with_repeated_blocks() { + // Generate head + std::shared_ptr input = std::make_shared(ov::element::i32, ov::Shape{1, 1, 40}); + m_nodes.push_back(input); + set_name(input); + + std::vector> head(7, nullptr); + head[0] = std::make_shared(input, input); + head[1] = std::make_shared(ov::element::i32, ov::Shape{1}, std::vector{2}); + head[2] = std::make_shared(head[0], head[1], true); + head[3] = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{1, 1, 4, 10}); + head[4] = std::make_shared(ov::element::i64, ov::Shape{3}, std::vector{1, 1, 40}); + head[5] = std::make_shared(head[2], head[3], false); + head[6] = std::make_shared(head[5], head[4], false); + + for (const auto& h : head) { + m_nodes.push_back(h); + set_name(h); + } + + // Generate repeated blocks + std::shared_ptr output = get_block(head[6]); + std::vector> outputs; + outputs.push_back(output); + + for (size_t i = 0; i < 9; ++i) { + output = get_block(output); + outputs.push_back(output); + } + + // Generate tail + std::vector> tail(6, nullptr); + tail[0] = std::make_shared(outputs, -1); + tail[1] = std::make_shared(ov::element::i32, ov::Shape{3}, std::vector{1, 20, 20}); + tail[2] = std::make_shared(tail[0], tail[1], false); + tail[3] = std::make_shared(ov::element::i32, ov::Shape{1, 1, 1}); + tail[4] = std::make_shared(tail[2], tail[3]); + tail[5] = std::make_shared(tail[4], tail[4]); + + for (const auto& t : tail) { + m_nodes.push_back(t); + set_name(t); + } + + // Create model + auto result = std::make_shared(tail[5]); + m_nodes.push_back(result); + set_name(result); + + ov::ParameterVector params = {input}; + ov::ResultVector results = {result}; + + return std::make_shared(results, params); + } + + std::shared_ptr get_block(const std::shared_ptr& input) { + // Parameters + // input + + // Constants + std::vector> model_c(18, nullptr); + model_c[0] = std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{0, 2, 1, 3}); + model_c[1] = std::make_shared(ov::element::i64, ov::Shape{1}, std::vector{1}); + model_c[2] = std::make_shared(ov::element::i64, ov::Shape{1}, std::vector{0}); + model_c[3] = std::make_shared(ov::element::i64, ov::Shape{1}, std::vector{2}); + model_c[4] = std::make_shared(ov::element::i64, ov::Shape{1}, std::vector{0}); + model_c[5] = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{1, 1, 1, 1}); + model_c[6] = std::make_shared(ov::element::i64, ov::Shape{1}, std::vector{1}); + model_c[7] = std::make_shared(ov::element::i64, ov::Shape{1}, std::vector{0}); + model_c[8] = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{1, 1, 1, 1}); + model_c[9] = std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{1, 1, 1, 2}); + model_c[10] = std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{1, 1, 1, 1}); + model_c[11] = std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{1, 1, 1, 2}); + model_c[12] = std::make_shared(ov::element::i32, ov::Shape{1, 1, 1, 1}); + model_c[13] = std::make_shared(ov::element::i32, ov::Shape{1, 1, 1, 1}); + model_c[14] = std::make_shared(ov::element::i32, ov::Shape{1, 1, 1, 
1}); + model_c[15] = std::make_shared(ov::element::f32, ov::Shape{40, 40}); + model_c[16] = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{1, 1, 4, 10}); + model_c[17] = std::make_shared(ov::element::i32, ov::Shape{3}, std::vector{1, 1, 40}); + + for (const auto& c : model_c) { + m_nodes.push_back(c); + set_name(c); + } + + // Converts + std::vector> convert(3, nullptr); + convert[0] = std::make_shared(model_c[15], ov::element::f16); + convert[1] = std::make_shared(convert[0], ov::element::i32); + convert[2] = std::make_shared(model_c[12], ov::element::i32); + + for (const auto& c : convert) { + m_nodes.push_back(c); + set_name(c); + } + + // Ops + std::vector> op(16, nullptr); + op[0] = std::make_shared(input, convert[1], false, true); + op[1] = std::make_shared(op[0], model_c[16], false); + op[2] = std::make_shared(op[1], model_c[0]); + op[3] = std::make_shared(op[2]); + op[4] = std::make_shared(op[3], model_c[1], model_c[2]); + op[5] = std::make_shared(op[4], model_c[3], true); + op[6] = std::make_shared(op[5]); + op[7] = std::make_shared(model_c[5], model_c[6], op[6], model_c[7]); + op[8] = std::make_shared(op[2], + model_c[8], + op[7], + model_c[9], + std::vector{1, 1, 1, 1}, + std::vector{1, 1, 1, 1}); + op[9] = std::make_shared(op[2], + op[7], + model_c[10], + model_c[11], + std::vector{1, 1, 1, 1}, + std::vector{1, 1, 1, 1}); + op[10] = std::make_shared(op[9], convert[2]); + op[11] = std::make_shared(std::vector>{op[10], op[8]}, -1); + op[12] = std::make_shared(model_c[13], op[11]); + op[13] = std::make_shared(model_c[14], op[2]); + op[14] = std::make_shared(op[13], op[12]); + op[15] = std::make_shared(op[14], model_c[17], false); + + for (const auto& o : op) { + m_nodes.push_back(o); + set_name(o); + } + + return op[15]; + } + +private: + void set_name(const std::shared_ptr& node) { + node->set_friendly_name("node_" + std::to_string(m_name_idx++)); + } + + std::vector> m_nodes; + size_t m_name_idx; +}; + +TEST(OnlinePartitioningTest, Partitioning_IsTheSame_SmallModel) { + ModelGenerator mg; + auto model = mg.get_model_without_repeated_blocks(); + + auto opt_desc = std::make_shared<::intel_npu::OptionsDesc>(); + auto cfg = ::intel_npu::Config(opt_desc); + ::intel_npu::registerNPUWOptions(*opt_desc); + std::map cfg_map = {{ "NPUW_ONLINE_KEEP_BLOCK_SIZE", "9" }}; + cfg.update(cfg_map); + + auto ens = ov::npuw::online::buildPartitioning(model, cfg); + + for (size_t i = 0; i < 100; ++i) { + auto ens_again = ov::npuw::online::buildPartitioning(model, cfg); + EXPECT_TRUE(isEqualEns(ens, ens_again)); + } +} + +TEST(OnlinePartitioningTest, Partitioning_IsTheSame_RepeatedModel) { + ModelGenerator mg; + auto model = mg.get_model_with_repeated_blocks(); + + auto opt_desc = std::make_shared<::intel_npu::OptionsDesc>(); + auto cfg = ::intel_npu::Config(opt_desc); + ::intel_npu::registerNPUWOptions(*opt_desc); + std::map cfg_map = {{ "NPUW_ONLINE_KEEP_BLOCK_SIZE", "9" }}; + cfg.update(cfg_map); + + auto ens = ov::npuw::online::buildPartitioning(model, cfg); + + for (size_t i = 0; i < 100; ++i) { + auto ens_again = ov::npuw::online::buildPartitioning(model, cfg); + EXPECT_TRUE(isEqualEns(ens, ens_again)); + } +} + +TEST(OnlinePartitioningTest, Partitioning_SingleGroup_SmallModel) { + ModelGenerator mg; + auto model = mg.get_model_without_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->singleGroup(); + EXPECT_EQ(snap->graphSize(), 1); +} + +TEST(OnlinePartitioningTest, Partitioning_SingleGroup_RepeatedModel) { + ModelGenerator mg; + auto model = 
mg.get_model_with_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->singleGroup(); + EXPECT_EQ(snap->graphSize(), 1); +} + +TEST(OnlinePartitioningTest, Partitioning_buildGraph_SmallModel) { + ModelGenerator mg; + auto model = mg.get_model_without_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + auto g = snap->getGraph(); + for (const auto& nh : g->sorted()) { + ov::npuw::online::Group::GPtr group = g->meta(nh).get(); + EXPECT_EQ(group->size(), 1); + } + EXPECT_EQ(snap->getNodeToGroupMap()->size(), snap->graphSize()); +} + +TEST(OnlinePartitioningTest, Partitioning_buildGraph_RepeatedModel) { + ModelGenerator mg; + auto model = mg.get_model_with_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + auto g = snap->getGraph(); + for (const auto& nh : g->sorted()) { + ov::npuw::online::Group::GPtr group = g->meta(nh).get(); + EXPECT_EQ(group->size(), 1); + } + EXPECT_EQ(snap->getNodeToGroupMap()->size(), snap->graphSize()); +} + +TEST(OnlinePartitioningTest, Partitioning_earlyAvoids_SmallModel) { + ModelGenerator mg; + auto model = mg.get_model_without_repeated_blocks(); + + auto snap = std::make_shared(model); + ov::npuw::online::PassContext ctx; + ctx.avoids = {{ov::npuw::online::PatternType::OP, "Gather", "mydevice"}, {ov::npuw::online::PatternType::OP, "MatMul", "mydevice"}}; + snap->setCtx(ctx); + snap->buildGraph(); + snap->earlyAvoids(); + auto g = snap->getGraph(); + size_t count = 0; + for (const auto& nh : g->sorted()) { + ov::npuw::online::Group::GPtr group = g->meta(nh).get(); + EXPECT_EQ(group->size(), 1); + if (group->avoidedTargets().size() == 1 && *(group->avoidedTargets().begin()) == "mydevice") { + ++count; + } + } + EXPECT_EQ(count, 2); +} + +TEST(OnlinePartitioningTest, Partitioning_earlyAvoids_RepeatedModel) { + ModelGenerator mg; + auto model = mg.get_model_with_repeated_blocks(); + + auto snap = std::make_shared(model); + ov::npuw::online::PassContext ctx; + ctx.avoids = {{ov::npuw::online::PatternType::OP, "Gather", "mydevice"}, {ov::npuw::online::PatternType::OP, "MatMul", "mydevice"}}; + snap->setCtx(ctx); + snap->buildGraph(); + snap->earlyAvoids(); + auto g = snap->getGraph(); + size_t count = 0; + for (const auto& nh : g->sorted()) { + ov::npuw::online::Group::GPtr group = g->meta(nh).get(); + EXPECT_EQ(group->size(), 1); + if (group->avoidedTargets().size() == 1 && *(group->avoidedTargets().begin()) == "mydevice") { + ++count; + } + } + EXPECT_EQ(count, 20); +} + +TEST(OnlinePartitioningTest, Partitioning_collectLHF_SmallModel) { + ModelGenerator mg; + auto model = mg.get_model_without_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + + std::vector sizes = {10, 10}; + size_t iter = 0; + + snap->repeat([&]{ + snap->collectLHF(); + EXPECT_LT(iter, sizes.size()); + EXPECT_EQ(snap->graphSize(), sizes[iter++]); + }); +} + +TEST(OnlinePartitioningTest, Partitioning_collectLHF_RepeatedModel) { + ModelGenerator mg; + auto model = mg.get_model_with_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + + std::vector sizes = {82, 82}; + size_t iter = 0; + + snap->repeat([&]{ + snap->collectLHF(); + EXPECT_LT(iter, sizes.size()); + EXPECT_EQ(snap->graphSize(), sizes[iter++]); + }); +} + +TEST(OnlinePartitioningTest, Partitioning_fuseRemnants_SmallModel) { + ModelGenerator mg; + auto model = mg.get_model_without_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + + std::vector sizes = {10, 10}; + 
size_t iter = 0; + + snap->repeat([&]{ + snap->fuseRemnants(); + EXPECT_LT(iter, sizes.size()); + EXPECT_EQ(snap->graphSize(), sizes[iter++]); + }); +} + +TEST(OnlinePartitioningTest, Partitioning_fuseRemnants_RepeatedModel) { + ModelGenerator mg; + auto model = mg.get_model_with_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + + std::vector sizes = {75, 38, 19, 10}; + size_t iter = 0; + + snap->repeat([&]{ + snap->fuseRemnants(); + EXPECT_LT(iter, sizes.size()); + EXPECT_EQ(snap->graphSize(), sizes[iter++]); + }); +} + +TEST(OnlinePartitioningTest, Partitioning_fuseRemnantsExtended_SmallModel) { + ModelGenerator mg; + auto model = mg.get_model_without_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + + std::vector sizes = {10, 10}; + size_t iter = 0; + + snap->repeat([&]{ + snap->fuseRemnantsExtended(); + EXPECT_LT(iter, sizes.size()); + EXPECT_EQ(snap->graphSize(), sizes[iter++]); + }); +} + +TEST(OnlinePartitioningTest, Partitioning_fuseRemnantsExtended_RepeatedModel) { + ModelGenerator mg; + auto model = mg.get_model_with_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + + std::vector sizes = {10, 10}; + size_t iter = 0; + + snap->repeat([&]{ + snap->fuseRemnantsExtended(); + EXPECT_LT(iter, sizes.size()); + EXPECT_EQ(snap->graphSize(), sizes[iter++]); + }); +} + +TEST(OnlinePartitioningTest, Partitioning_fuseInputs_SmallModel) { + ModelGenerator mg; + auto model = mg.get_model_without_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + + std::vector sizes = {15, 14, 14}; + size_t iter = 0; + + snap->repeat([&]{ + snap->fuseInputs(); + EXPECT_LT(iter, sizes.size()); + EXPECT_EQ(snap->graphSize(), sizes[iter++]); + }); +} + +TEST(OnlinePartitioningTest, Partitioning_fuseInputs_RepeatedModel) { + ModelGenerator mg; + auto model = mg.get_model_with_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + + std::vector sizes = {148, 138, 138}; + size_t iter = 0; + + snap->repeat([&]{ + snap->fuseInputs(); + EXPECT_LT(iter, sizes.size()); + EXPECT_EQ(snap->graphSize(), sizes[iter++]); + }); +} + +TEST(OnlinePartitioningTest, Partitioning_Compiler_Just_SmallModel) { + ModelGenerator mg; + auto model = mg.get_model_without_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + + std::vector sizes_lhf = {10, 10}; + size_t iter_lhf = 0; + + std::vector sizes_fr = {10, 10}; + size_t iter_fr = 0; + + snap->repeat([&] { + snap->collectLHF(); + EXPECT_LT(iter_lhf, sizes_lhf.size()); + EXPECT_EQ(snap->graphSize(), sizes_lhf[iter_lhf++]); + }); + snap->repeat([&] { + snap->fuseRemnants(); + EXPECT_LT(iter_fr, sizes_fr.size()); + EXPECT_EQ(snap->graphSize(), sizes_fr[iter_fr++]); + }); +} + +TEST(OnlinePartitioningTest, Partitioning_Compiler_Just_RepeatedModel) { + ModelGenerator mg; + auto model = mg.get_model_with_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + + std::vector sizes_lhf = {82, 82}; + size_t iter_lhf = 0; + + std::vector sizes_fr = {41, 21, 11, 10, 10}; + size_t iter_fr = 0; + + snap->repeat([&] { + snap->collectLHF(); + EXPECT_LT(iter_lhf, sizes_lhf.size()); + EXPECT_EQ(snap->graphSize(), sizes_lhf[iter_lhf++]); + }); + snap->repeat([&] { + snap->fuseRemnants(); + EXPECT_LT(iter_fr, sizes_fr.size()); + EXPECT_EQ(snap->graphSize(), sizes_fr[iter_fr++]); + }); +} + +TEST(OnlinePartitioningTest, Partitioning_Compiler_RepeatedBlocks_SmallModel) { + ModelGenerator 
mg;
+    auto model = mg.get_model_without_repeated_blocks();
+
+    auto snap = std::make_shared<ov::npuw::online::Snapshot>(model);
+    snap->buildGraph();
+
+    std::vector<size_t> sizes_fr = {10, 10};
+    size_t iter_fr = 0;
+
+    snap->earlyAvoids();
+    snap->earlyRegroup();
+    snap->repeatedBlocks();
+    EXPECT_EQ(snap->graphSize(), 17);
+
+    auto matches = snap->getMatches();
+    EXPECT_EQ(matches.size(), 0);
+
+    snap->repeat([&] {
+        snap->fuseRemnantsExtended();
+        EXPECT_LT(iter_fr, sizes_fr.size());
+        EXPECT_EQ(snap->graphSize(), sizes_fr[iter_fr++]);
+    });
+}
+
+TEST(OnlinePartitioningTest, Partitioning_Compiler_RepeatedBlocks_RepeatedModel) {
+    ModelGenerator mg;
+    auto model = mg.get_model_with_repeated_blocks();
+
+    auto snap = std::make_shared<ov::npuw::online::Snapshot>(model);
+    snap->buildGraph();
+
+    std::vector<size_t> sizes_fr = {12, 12};
+    size_t iter_fr = 0;
+
+    snap->earlyAvoids();
+    snap->earlyRegroup();
+    snap->repeatedBlocks();
+    EXPECT_EQ(snap->graphSize(), 18);
+
+    auto matches = snap->getMatches();
+    EXPECT_EQ(matches.size(), 1);
+
+    for (const auto& m : matches) {
+        EXPECT_EQ(m.second.size(), 17);
+        for (const auto& layers : m.second) {
+            EXPECT_EQ(layers.size(), 10);
+        }
+    }
+
+    snap->repeat([&] {
+        snap->fuseRemnantsExtended();
+        EXPECT_LT(iter_fr, sizes_fr.size());
+        EXPECT_EQ(snap->graphSize(), sizes_fr[iter_fr++]);
+    });
+}
+
+TEST(OnlinePartitioningTest, Partitioning_Compiler_Compute_SmallModel) {
+    ModelGenerator mg;
+    auto model = mg.get_model_without_repeated_blocks();
+
+    auto snap = std::make_shared<ov::npuw::online::Snapshot>(model);
+
+    std::vector<size_t> sizes_fr = {10, 10};
+    size_t iter_fr = 0;
+
+    ov::npuw::online::PassContext ctx;
+    ctx.isolates = {{ov::npuw::online::PatternType::OP, "Transpose", "test_compute"},
+                    {ov::npuw::online::PatternType::OP, "ScatterUpdate", "test_compute"}};
+    ctx.nofolds = {"test_compute"};
+    snap->setCtx(ctx);
+
+    snap->buildGraph();
+    snap->earlyAvoids();
+    snap->earlyRegroup();
+    snap->repeatedBlocks();
+    EXPECT_EQ(snap->graphSize(), 17);
+
+    auto matches = snap->getMatches();
+    EXPECT_EQ(matches.size(), 0);
+
+    snap->repeat([&] {
+        snap->fuseRemnantsExtended();
+        EXPECT_LT(iter_fr, sizes_fr.size());
+        EXPECT_EQ(snap->graphSize(), sizes_fr[iter_fr++]);
+    });
+}
+
+TEST(OnlinePartitioningTest, Partitioning_Compiler_Compute_RepeatedModel) {
+    ModelGenerator mg;
+    auto model = mg.get_model_with_repeated_blocks();
+
+    auto snap = std::make_shared<ov::npuw::online::Snapshot>(model);
+
+    std::vector<size_t> sizes_fr = {10, 10};
+    size_t iter_fr = 0;
+
+    ov::npuw::online::PassContext ctx;
+    ctx.isolates = {{ov::npuw::online::PatternType::OP, "Gather", "test_compute"},
+                    {ov::npuw::online::PatternType::OP, "ScatterUpdate", "test_compute"},
+                    {ov::npuw::online::PatternType::OP, "ShapeOf", "test_compute"},
+                    {ov::npuw::online::PatternType::OP, "Divide", "test_compute"},
+                    {ov::npuw::online::PatternType::OP, "Floor", "test_compute"}};
+    ctx.nofolds = {"test_compute"};
+    snap->setCtx(ctx);
+
+    snap->buildGraph();
+    snap->earlyAvoids();
+    snap->earlyRegroup();
+    snap->repeatedBlocks();
+    EXPECT_EQ(snap->graphSize(), 29);
+
+    // FIXME: create a config in which there will be repeated blocks
+    auto matches = snap->getMatches();
+    EXPECT_EQ(matches.size(), 0);
+
+    snap->repeat([&] {
+        snap->fuseRemnantsExtended();
+        EXPECT_LT(iter_fr, sizes_fr.size());
+        EXPECT_EQ(snap->graphSize(), sizes_fr[iter_fr++]);
+    });
+}
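The repeat() loops in the tests above drive each pass to a fixpoint: the sizes/sizes_fr vectors list the group-graph size expected after every iteration. A small self-contained model of that contract (the stopping rule and the kMinGroups = 10 floor are assumptions inferred from the expected size sequences, not the npuw implementation):

    #include <cstddef>
    #include <functional>

    struct GroupGraph {
        std::size_t size;  // number of groups in the partitioning graph
        static constexpr std::size_t kMinGroups = 10;
        // Re-run a fusion pass until it stops making progress or the
        // group count reaches the lower bound.
        void repeat(const std::function<void()>& pass) {
            std::size_t last;
            do {
                last = size;
                pass();  // the test lambdas also assert the size here
            } while (size != last && size > kMinGroups);
        }
    };

    int main() {
        GroupGraph g{150};
        // A halving pass: the callback observes sizes 75, 38, 19, 10 --
        // the same shape as {75, 38, 19, 10} in Partitioning_fuseRemnants_RepeatedModel.
        g.repeat([&g] { g.size = (g.size + 1) / 2; });
        return static_cast<int>(g.size);
    }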
diff --git a/src/plugins/intel_npu/tests/unit/npuw/unpack.cpp b/src/plugins/intel_npu/tests/unit/npuw/unpack.cpp
new file mode 100644
index 00000000000000..1049832f6ead7c
--- /dev/null
+++ b/src/plugins/intel_npu/tests/unit/npuw/unpack.cpp
@@ -0,0 +1,103 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#ifdef HAVE_AVX2
+#include "unpack.hpp"
+
+namespace {
+
+const auto TestCases = ::testing::Combine(
+    ::testing::ValuesIn({ov::element::Type_t::i4}),
+    ::testing::ValuesIn({ov::element::Type_t::i8, ov::element::Type_t::f16}),
+    ::testing::ValuesIn({ov::element::Type_t::undefined}),  // not used in this test
+    ::testing::ValuesIn({ov::element::Type_t::undefined}),  // not used in this test
+    ::testing::ValuesIn({3lu, 0lu}),
+    ::details::ShapesIn({Tensors{input={1, 1, 1, 32};},
+                         Tensors{input={1, 1, 1, 128};},
+                         Tensors{input={1, 1, 1, 390};},
+                         Tensors{input={1, 1, 1, 82};}}),
+    ::testing::ValuesIn({true, false}),
+    ::testing::ValuesIn({true, false})
+);
+
+INSTANTIATE_TEST_SUITE_P(UnpackTests, UnpackTests,
+                         TestCases,
+                         UnpackTests::getTestCaseName);
+
+const auto TestCasesScale = ::testing::Combine(
+    ::testing::ValuesIn({ov::element::Type_t::i4}),  // TODO: add i8 as input for test
+    ::testing::ValuesIn({ov::element::Type_t::f16, ov::element::Type_t::f32}),
+    ::testing::ValuesIn({ov::element::Type_t::f16, ov::element::Type_t::f32}),
+    ::testing::ValuesIn({ov::element::Type_t::undefined}),  // not used in this test
+    ::testing::ValuesIn({3lu, 0lu}),
+    ::details::ShapesIn({Tensors{input={1, 32, 128}; scale = {1, 32, 1};},
+                         Tensors{input={32, 128}; scale = {32, 1};},
+                         Tensors{input={64, 160}; scale = {64, 1};},
+                         Tensors{input={1024, 4}; scale = {64, 1};},
+                         Tensors{input={1, 1, 1024, 4}; scale = {1, 1, 64, 1};}}),
+    ::testing::ValuesIn({true, false}),
+    ::testing::ValuesIn({true, false})
+);
+
+INSTANTIATE_TEST_SUITE_P(UnpackWithScaleTests, UnpackWithScaleTests,
+                         TestCasesScale,
+                         UnpackWithScaleTests::getTestCaseName);
+
+const auto TestCasesScaleAndZeroPoints = ::testing::Combine(
+    ::testing::ValuesIn({ov::element::Type_t::u4}),
+    ::testing::ValuesIn({ov::element::Type_t::f16}),
+    ::testing::ValuesIn({ov::element::Type_t::f16}),
+    ::testing::ValuesIn({ov::element::Type_t::u4}),
+    ::testing::ValuesIn({3lu, 0lu}),
+    ::details::ShapesIn({Tensors{input={1, 32, 128}; scale = {1, 32, 1};},
+                         Tensors{input={1, 64, 160}; scale = {1, 64, 1};},
+                         Tensors{input={1, 1024, 4}; scale = {1, 64, 1};},
+                         Tensors{input={1, 1, 1024, 4}; scale = {1, 1, 64, 1};},
+                         Tensors{input={64, 1}; scale = {64, 1};}}),
+    ::testing::ValuesIn({true, false}),
+    ::testing::ValuesIn({true, false})
+);
+
+INSTANTIATE_TEST_SUITE_P(UnpackTestsWithScaleAndZeroPoint, UnpackTestsWithScaleAndZeroPoint,
+                         TestCasesScaleAndZeroPoints,
+                         UnpackTestsWithScaleAndZeroPoint::getTestCaseName);
+
+const auto TestCasesScaleAndZeroPoints2 = ::testing::Combine(
+    ::testing::ValuesIn({ov::element::Type_t::u4}),
+    ::testing::ValuesIn({ov::element::Type_t::f16}),
+    ::testing::ValuesIn({ov::element::Type_t::f32}),
+    ::testing::ValuesIn({ov::element::Type_t::f32}),
+    ::testing::ValuesIn({3lu, 0lu}),
+    ::details::ShapesIn({Tensors{input={32, 32, 64}; scale = {32, 1, 64};},
+                         Tensors{input={64, 64, 128}; scale = {64, 1, 128};},
+                         Tensors{input={64, 32, 32}; scale = {64, 1, 32};}}),
+    ::testing::ValuesIn({true, false}),
+    ::testing::ValuesIn({true, false})
+);
+
+INSTANTIATE_TEST_SUITE_P(UnpackTestsWithScaleAndZeroPointTest2, UnpackTestsWithScaleAndZeroPointTest2,
+                         TestCasesScaleAndZeroPoints2,
+                         UnpackTestsWithScaleAndZeroPointTest2::getTestCaseName);
+
+const auto TestCasesScaleAndZeroPoints3 = ::testing::Combine(
+    ::testing::ValuesIn({ov::element::Type_t::u4}),
+    ::testing::ValuesIn({ov::element::Type_t::f16}),
+    ::testing::ValuesIn({ov::element::Type_t::f16}),
+    
::testing::ValuesIn({ov::element::Type_t::u4}), + ::testing::ValuesIn({3lu, 0lu}), + ::details::ShapesIn({Tensors{input={1, 32, 128}; scale = {1, 32, 1}; zerop = {1, 32, 1};}, + Tensors{input={16, 64, 64}; scale = {16, 64, 1}; zerop = {16, 64, 1};}, + Tensors{input={1, 1024, 4}; scale = {1, 64, 1}; zerop = {1, 32, 1};}}), + ::testing::ValuesIn({true, false}), + ::testing::ValuesIn({true, false}) +); + +INSTANTIATE_TEST_SUITE_P(UnpackTestsWithScaleAndZeroPointTest3, UnpackTestsWithScaleAndZeroPointTest3, + TestCasesScaleAndZeroPoints3, + UnpackTestsWithScaleAndZeroPointTest3::getTestCaseName); + +} // anonymous namespace + +#endif // __AVX2__ diff --git a/src/plugins/intel_npu/tests/unit/npuw/unpack.hpp b/src/plugins/intel_npu/tests/unit/npuw/unpack.hpp new file mode 100644 index 00000000000000..da5bb4e4720f3e --- /dev/null +++ b/src/plugins/intel_npu/tests/unit/npuw/unpack.hpp @@ -0,0 +1,628 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include +#include +#include +#include + +#include "openvino/runtime/make_tensor.hpp" + +#include "util.hpp" + +namespace { + +#define ASSERT_NO_THROW_WITH_MESSAGE(code) do{ \ + try {\ + code;\ + }catch (const std::exception &ex ) {\ + FAIL()<> 4) | ((x & (1 << 6)) >> 4) | ((x & (1 << 5)) >> 4) | ((x & (1 << 4)) >> 4); +} + +inline int8_t lo4(int8_t x) { + return (x & (1 << 3)) | (x & (1 << 2)) | (x & (1 << 1)) | (x & (1 << 0)); +} + +inline uint8_t hi4(uint8_t x) { + return x >> 4; +} + +inline uint8_t lo4(uint8_t x) { + return x & 0x0F; +} + +inline int8_t upc(int8_t h) { + return h | (-((h & (1 << 3)) >> 3) & (-8)); +} + +typedef unsigned short ushort; +typedef unsigned int uint; + +float half_to_float(const ushort x) { + + __m128i halfVector = _mm_cvtsi32_si128(x); + __m128 floatVector = _mm_cvtph_ps(halfVector); + return _mm_cvtss_f32(floatVector); +} + +ushort float_to_half(const float x) { + __m128 floatVector = _mm_set_ss(x); + __m128i halfVector = _mm_cvtps_ph(floatVector, _MM_FROUND_TO_NEAREST_INT); + return _mm_extract_epi16(halfVector, 0); +} + +inline uint16_t int2hfloat(int8_t x) +{ + float inputFl32 = static_cast(x); + float* inputFl32_ptr = &inputFl32; + unsigned int* fltInt32Ptr = reinterpret_cast(inputFl32_ptr); + unsigned int fltInt32 = *fltInt32Ptr; + unsigned short fltInt16; + + fltInt16 = (fltInt32 >> 31) << 5; + unsigned short tmp = (fltInt32 >> 23) & 0xff; + tmp = (tmp - 0x70) & ((unsigned int)((int)(0x70 - tmp) >> 4) >> 27); + fltInt16 = (fltInt16 | tmp) << 10; + fltInt16 |= (fltInt32 >> 13) & 0x3ff; + + return fltInt16; +} + + +void unpack(const int8_t* in, int8_t* out, int size) { + for (int i = 0; i < size / 2; i++) { + *(out++) = upc(lo4(*in)); + *(out++) = upc(hi4(*in)); + in++; + } +} + +void unpack_i4f16(const int8_t* in, int8_t* out, int size) { + uint16_t *hFloatOut = reinterpret_cast(out); + + for (int i = 0; i < size / 2; i++) { + *(hFloatOut++) = int2hfloat(upc(lo4(*in))); + *(hFloatOut++) = int2hfloat(upc(hi4(*in))); + in++; + } +} + +/*u4 order*/ +void unpack_u4f32(const int8_t* in, float* out, int size) { + for (int i = 0; i < size / 2; i++) { + *(out++) = static_cast(lo4(*in)); + *(out++) = static_cast(hi4(*in)); + in++; + } +} + +template +::testing::AssertionResult fp16ArraysMatch(const T &actual, + const T &expected, + const T &i4Input, + bool int4 = 1 /*i4 or u4*/){ + for (size_t i = 0; i < expected.size() / 2; ++i) { + + int int8Input[] ={ + details::lo4(i4Input[i / 2]), + details::hi4(i4Input[i / 2]) + }; + + if (int4) { + int8Input[0] = 
details::upc(int8Input[0]);
+            int8Input[1] = details::upc(int8Input[1]);
+        }
+
+        auto fp16ref = int{*((uint16_t*)expected.data() + i)};
+        auto fp16out = int{*((uint16_t*)actual.data() + i)};
+
+#define _P(x) std::dec << std::setw(5) << (x) << '(' << std::setw(4) << std::hex << (x) << ')'
+        if (fp16ref != fp16out) {
+            return ::testing::AssertionFailure() << std::dec << std::setw(4) << i << ", i4:"
+                                                 << std::setw(2) << int8Input[i % 2]
+                                                 << " | ref " << _P(fp16ref)
+                                                 << ", test " << _P(fp16out) << "\n";
+        }
+#undef _P
+
+    }
+
+    return ::testing::AssertionSuccess();
+}
+
+}  // namespace details
+
+using ShapesInitializer = std::function<void(std::vector<int>&, std::vector<int>&, std::vector<int>&)>;
+
+using UnpackTestsParams = std::tuple<
+        ov::element::Type_t,  // fromPrecision
+        ov::element::Type_t,  // toPrecision
+        ov::element::Type_t,  // scalePrecision
+        ov::element::Type_t,  // zeroPointPrecision
+        unsigned long,        // nPartitions
+        ShapesInitializer,    // input_shape, scale_shape, zerop initializer
+        bool,                 // use parallel_for
+        bool                  // strict partitioning
+        >;
+
+class UnpackTestsBase {
+protected:
+    ov::element::Type fromType;
+    ov::element::Type toType;
+    ov::element::Type scaleType;
+    ov::element::Type zeropType;
+    std::shared_ptr<ov::ITensor> from, to, scale, zerop;
+
+    std::vector<int8_t> input;
+    std::vector<int8_t> output;
+    std::vector<int8_t> ref_output;
+    std::vector<uint8_t> scalesStorage;
+    std::vector<uint8_t> zeropStorage;
+    float zeropValue;
+    ov::Shape input_shape;
+    ov::Shape scale_shape;
+    ov::Shape zerop_shape;
+
+    size_t nPartitions;
+    bool useParallelFor = false;
+    bool strictPartitions = false;
+
+    void make_zeropoints() {
+        if (zeropType == ov::element::undefined) {
+            return;
+        }
+
+        const std::vector<float> zeropValues = {15.0f, 12.0f, 0.0f, 31.0f};
+        const size_t nElements = shape_size(zerop_shape);
+
+        // Set zeropValue if there's only one element
+        if (nElements == 1) {
+            zeropValue = zeropValues.front();
+        }
+
+        // Determine the size of the storage based on the type and resize the storage vector
+        if (zeropType == ov::element::Type_t::u4) {
+            zeropStorage.resize((nElements + 1) / 2, 0);  // Each u4 zeropoint is 4 bits, so two zeropoints fit in one byte
+        } else if (zeropType == ov::element::Type_t::f32) {
+            zeropStorage.resize(nElements * sizeof(float), 0);
+        } else {
+            ASSERT_TRUE(zeropType == ov::element::u4 || zeropType == ov::element::f32);
+        }
+
+        // Fill the storage with the appropriate values
+        if (zeropType == ov::element::Type_t::u4) {
+            for (size_t i = 0; i < nElements; ++i) {
+                uint8_t zeropValueU4 = static_cast<uint8_t>(zeropValues[i % zeropValues.size()]) & 0x0F;
+                size_t byteIndex = i / 2;
+                if (i % 2 == 0) {
+                    zeropStorage[byteIndex] = zeropValueU4;
+                } else {
+                    zeropStorage[byteIndex] |= (zeropValueU4 << 4);  // keep the low nibble written at i - 1
+                }
+            }
+        } else if (zeropType == ov::element::Type_t::f32) {
+            float* ptrWork = reinterpret_cast<float*>(zeropStorage.data());
+            for (size_t i = 0; i < nElements; ++i) {
+                ptrWork[i] = zeropValues[i % zeropValues.size()];
+            }
+        }
+
+        // Create the tensor
+        zerop = ov::make_tensor(zeropType, zerop_shape, zeropStorage.data());
+    }
+
+    void make_scales() {
+        if (scaleType == ov::element::undefined) {
+            return;
+        }
+        ASSERT_TRUE(scaleType == ov::element::f16 || scaleType == ov::element::f32);
+        size_t nElements = shape_size(scale_shape);
+
+        // creating custom scale factors
+        const size_t nScaleBytes = scaleType.bitwidth() * nElements / 8;
+
+        std::vector<float> sc(nElements);
+        float coeffTable[] = {
+            0.1f,
+            0.5f,
+            1.f,
+            2.f
+        };
+        for (size_t i = 0; i != nElements; i++) {
+            sc[i] = coeffTable[i % (sizeof(coeffTable) / sizeof(*coeffTable))];
+        }
+        
scalesStorage.resize(nScaleBytes); + + if (scaleType == ov::element::f16) { + uint16_t * ptrWork = reinterpret_cast(scalesStorage.data()); + for (size_t i = 0; i != nElements; i++) { + ptrWork[i] = details::float_to_half(sc[i]); + } + } + if (scaleType == ov::element::f32) { + float* ptrWork = reinterpret_cast(scalesStorage.data()); + for (size_t i = 0; i != nElements; i++) { + ptrWork[i] = sc[i]; + } + } + scale = ov::make_tensor(scaleType, scale_shape, scalesStorage.data()); + } + + void make_input() { + + size_t nElements = shape_size(input_shape); + + ASSERT_EQ((fromType.bitwidth() * nElements) % 8, 0) << "Input len has to be byte boundary aligned, but was " + << fromType.bitwidth() * nElements << " bits"; + ASSERT_EQ((toType.bitwidth() * nElements) % 8, 0) << "Output len has to be byte boundary aligned"; + + const size_t nInputBytes = fromType.bitwidth() * nElements / 8; + const size_t nOutputBytes = toType.bitwidth() * nElements / 8; + + input.resize(nInputBytes); + ref_output.resize(nOutputBytes); + output.resize(nOutputBytes); + std::fill(ref_output.begin(), ref_output.end(), 0); + std::fill(output.begin(), output.end(), 0); + + std::array input_local = { + 0x0A, 0x0B, 0x1C, 0x1D, 0x2E, 0x2F, 0x35, 0x36, + 0x4A, 0x4B, 0x5A, 0x5B, 0x6A, 0x6B, 0x7A, 0x7B, + 0x0C, 0x0D, 0x1C, 0x1D, 0x2C, 0x2D, 0x3C, 0x3D, + 0x4C, 0x4D, 0x5C, 0x5D, 0x6C, 0x6D, 0x7C, 0x7D, + }; + + for (size_t idx = 0, k = 0; k < nInputBytes; k++, idx = (idx + 1) % input_local.size()) { + input[k] = input_local[idx]; + } + + from = ov::make_tensor(fromType, input_shape, input.data()); + to = ov::make_tensor(toType, input_shape, output.data()); + } +public: + void SetUp(const UnpackTestsParams & getParam) { + ShapesInitializer shapeInit; + + std::tie(fromType, toType, scaleType, zeropType, nPartitions, shapeInit, useParallelFor, strictPartitions) = getParam; + + std::vector input, scale, zerop; + shapeInit(input, scale, zerop); + + input_shape = ov::Shape{input.begin(), input.end()}; + scale_shape = ov::Shape{scale.begin(), scale.end()}; + if (zerop.empty()) { + zerop_shape = ov::Shape({1}); + } else { + zerop_shape = ov::Shape{zerop.begin(), zerop.end()}; + } + + make_input(); + make_scales(); + make_zeropoints(); + + make_ref_output(); + } + std::string ToString() const { + std::ostringstream result; + result << (isNegative() ? "NEGATIVE_" : "") + <<"["; + + for (size_t i = 0; i != input_shape.size(); i++) { + result << input_shape[i] << ((i + 1 == input_shape.size()) ? "" : "x"); + } + result <<"]" + << "_p" << nPartitions + << (strictPartitions ? "_SP" : "") + << (useParallelFor ? 
"_parallel" : "_serial") + << "_from_" << fromType + << "_to_" << toType; + if (scaleType != ov::element::Type_t::undefined) + result << "_scale_" << scaleType; + if (zeropType != ov::element::Type_t::undefined) + result << "_zerop_" << zeropType; + + return result.str(); + } + + /** + * Negative test cases has to be carefully reviewed, to still remain positive runs at some points + * @return + */ + virtual bool isNegative() const { + return false; + } + + virtual void make_ref_output() { + size_t nElements = 1; + for (size_t dim : input_shape) { + nElements *= dim; + } + if (toType == ov::element::i8) { + details::unpack(input.data(), ref_output.data(), static_cast(nElements)); + } else if (toType == ov::element::f16) { + details::unpack_i4f16(input.data(), ref_output.data(), static_cast(nElements)); + } + } +}; + +template +class UnpackTestsTmpl : + public ::testing::Test, + public T, + public ::testing::WithParamInterface { +protected: + + void SetUp() override { + T::SetUp(GetParam()); + } +public: + static std::string getTestCaseName(const testing::TestParamInfo& obj) { + T _bt; + _bt.SetUp(obj.param); + return _bt.ToString(); + } +}; + +using UnpackTests = UnpackTestsTmpl; +class UnpackTestsRef : public UnpackTests {}; + +TEST_P(UnpackTests, i4) { + ASSERT_NO_THROW_WITH_MESSAGE(ov::npuw::util::unpack(from, to, ov::npuw::util::UnpackOptions{useParallelFor, nPartitions, strictPartitions})); + ASSERT_TRUE(details::fp16ArraysMatch(output, ref_output, input)); +} + +class UnpackWithScaleTestsBase : public UnpackTestsBase { +protected: + bool isNegative() const override { + if (scale_shape.size() != 3 && scale_shape.size() != 2) return true; + if (input_shape.back() % 64) return true; + if ((from->get_size() / scale->get_size()) % 64) return true; + if (toType != ov::element::f16) return true; + + return false; + } + + void make_ref_output() override { + if (isNegative()) return; + + size_t nElements = from->get_size(); + + const size_t nOutputElementsPerScale = ref_output.size() / (toType.bitwidth() / 8) / scale->get_size(); + + details::unpack_i4f16(input.data(), ref_output.data(), static_cast(nElements)); + + // lets apply per channel scale + uint16_t * pRef = reinterpret_cast(ref_output.data()); + uint16_t * pScale_f16 = reinterpret_cast(scale->data()); + float * pScale_f32 = reinterpret_cast(scale->data()); + + for (size_t i = 0; i < scale->get_size(); i++) { + for (size_t sc = 0; sc != nOutputElementsPerScale; sc++) { + float ref_scaled = details::half_to_float(pRef[0]); + if (scaleType == ov::element::f32) { + ref_scaled *= pScale_f32[0]; + } else if (scaleType == ov::element::f16) { + ref_scaled *= details::half_to_float(pScale_f16[0]); + } + *pRef = details::float_to_half(ref_scaled); + pRef++; + } + pScale_f32++; + pScale_f16++; + } + } + +}; + +using UnpackWithScaleTests = UnpackTestsTmpl; + + +TEST_P(UnpackWithScaleTests, i4_scale) { + ASSERT_NO_THROW_IF(!isNegative(), + ov::npuw::util::unpack(from, scale, to, ov::npuw::util::UnpackOptions{useParallelFor, nPartitions, strictPartitions})); + if (!isNegative()) { + ASSERT_TRUE(details::fp16ArraysMatch(output, ref_output, input)); + } +} + + +class UnpackTestsWithScaleAndZeroPointBase : public UnpackTestsBase { +protected: + bool isNegative() const override { + if (scale_shape.size() != 3 && scale_shape.size() != 2) return true; + if (input_shape.back() % 64) return true; + + return false; + } + + void make_ref_output() override { + if (isNegative()) return; + + size_t nElements = from->get_size(); + + const size_t 
nOutputElementsPerScale = ref_output.size() / (toType.bitwidth() / 8) / scale->get_size(); + + std::vector floatRef(nElements); + details::unpack_u4f32(input.data(), floatRef.data(), static_cast(nElements)); + + + // lets apply per channel scale + uint16_t * pRef = reinterpret_cast(ref_output.data()); + float * pFloatRef = reinterpret_cast(floatRef.data()); + const uint16_t * pScale_f16 = reinterpret_cast(scale->data()); + const float * pScale_f32 = reinterpret_cast(scale->data()); + + for (size_t i = 0; i < scale->get_size(); i++) { + for (size_t sc = 0; sc != nOutputElementsPerScale; sc++) { + // applying zeropoint + float ref_scaled = *pFloatRef - zeropValue; + + if (scaleType == ov::element::f32) { + ref_scaled *= pScale_f32[0]; + } else if (scaleType == ov::element::f16) { + ref_scaled *= details::half_to_float(pScale_f16[0]); + } + *pRef = details::float_to_half(ref_scaled); + + pFloatRef++; + pRef++; + } + pScale_f32++; + pScale_f16++; + } + } +}; + +using UnpackTestsWithScaleAndZeroPoint = UnpackTestsTmpl; + +TEST_P(UnpackTestsWithScaleAndZeroPoint, u4) { + ASSERT_NO_THROW_IF(!isNegative(), + ov::npuw::util::unpack(from, zerop, scale, to, ov::npuw::util::UnpackOptions{useParallelFor, nPartitions, strictPartitions})); + if (!isNegative()) { + ASSERT_TRUE(details::fp16ArraysMatch(output, ref_output, input, false)); + } +} + +class UnpackTestsWithScaleAndZeroPoint2 : public UnpackTestsWithScaleAndZeroPointBase { +protected: + bool isNegative() const override { + if (input_shape.back() % 64 || input_shape.size() != 3) return true; + if (scale_shape.back() % 64 || scale_shape.size() != 3) return true; + + return false; + } + + void make_ref_output() override { + if (isNegative()) return; + + size_t nElements = from->get_size(); + const auto from_shape = from->get_shape(); + + const size_t C = from_shape[from_shape.size() - 3]; + const size_t H = from_shape[from_shape.size() - 2]; + const size_t W = from_shape[from_shape.size() - 1]; + + std::vector floatRef(nElements); + details::unpack_u4f32(input.data(), floatRef.data(), static_cast(nElements)); + + uint16_t * pRef = reinterpret_cast(ref_output.data()); + float * pFloatRef = reinterpret_cast(floatRef.data()); + const uint16_t * pScale_f16 = reinterpret_cast(scale->data()); + const float * pScale_f32 = reinterpret_cast(scale->data()); + + for (size_t c = 0; c < C; ++c) { + for (size_t h = 0; h < H; ++h) { + for (size_t w = 0; w < W; ++w) { + size_t input_index = w + W * h + W * H * c; + size_t scale_index = w + W * c; + float ref_scaled = pFloatRef[input_index] - zeropValue; + if (scaleType == ov::element::f32) { + ref_scaled *= pScale_f32[scale_index]; + } else if (scaleType == ov::element::f16) { + ref_scaled *= details::half_to_float(pScale_f16[scale_index]); + } + pRef[w + W * h + c * W * H] = details::float_to_half(ref_scaled); + } + } + } + } +}; + +using UnpackTestsWithScaleAndZeroPointTest2 = UnpackTestsTmpl; + +TEST_P(UnpackTestsWithScaleAndZeroPointTest2, u4) { + ASSERT_NO_THROW_IF(!isNegative(), + ov::npuw::util::unpack(from, zerop, scale, to, ov::npuw::util::UnpackOptions{useParallelFor, nPartitions, strictPartitions})); + if (!isNegative()) { + ASSERT_TRUE(details::fp16ArraysMatch(output, ref_output, input, false)); + } +} + +class UnpackTestsWithScaleAndZeroPoint3 : public UnpackTestsWithScaleAndZeroPointBase { +protected: + bool isNegative() const override { + if (scale_shape.size() != 3 || zerop_shape.size() != 3) return true; + if (input_shape[2] % 64 || input_shape.size() != 3) return true; + + return false; + } + + 
void make_ref_output() override {
+        if (isNegative()) return;
+
+        size_t nElements = from->get_size();
+
+        const size_t nOutputElementsPerScale = ref_output.size() / (toType.bitwidth() / 8) / scale->get_size();
+
+        std::vector<float> floatRef(nElements);
+        details::unpack_u4f32(input.data(), floatRef.data(), static_cast<int>(nElements));
+
+        // let's apply per-channel scale
+        uint16_t* pRef = reinterpret_cast<uint16_t*>(ref_output.data());
+        const uint8_t* pZer = static_cast<const uint8_t*>(zerop->data());
+        float* pFloatRef = reinterpret_cast<float*>(floatRef.data());
+        const uint16_t* pScale_f16 = reinterpret_cast<const uint16_t*>(scale->data());
+        const float* pScale_f32 = reinterpret_cast<const float*>(scale->data());
+
+        for (size_t i = 0; i < scale->get_size(); i++) {
+            float zeroPointValue = static_cast<float>((i % 2 == 0) ? details::lo4(pZer[i / 2]) : details::hi4(pZer[i / 2]));
+            for (size_t sc = 0; sc != nOutputElementsPerScale; sc++) {
+                // applying zeropoint
+                float ref_scaled = *pFloatRef - zeroPointValue;
+
+                if (scaleType == ov::element::f32) {
+                    ref_scaled *= pScale_f32[0];
+                } else if (scaleType == ov::element::f16) {
+                    ref_scaled *= details::half_to_float(pScale_f16[0]);
+                }
+                *pRef = details::float_to_half(ref_scaled);
+
+                pFloatRef++;
+                pRef++;
+            }
+            pScale_f32++;
+            pScale_f16++;
+        }
+    }
+};
+
+using UnpackTestsWithScaleAndZeroPointTest3 = UnpackTestsTmpl<UnpackTestsWithScaleAndZeroPoint3>;
+
+TEST_P(UnpackTestsWithScaleAndZeroPointTest3, u4) {
+    ASSERT_NO_THROW_IF(!isNegative(),
+                       ov::npuw::util::unpack(from, zerop, scale, to, ov::npuw::util::UnpackOptions{useParallelFor, nPartitions, strictPartitions}));
+    if (!isNegative()) {
+        ASSERT_TRUE(details::fp16ArraysMatch(output, ref_output, input, false));
+    }
+}
+
+#define Tensors [](std::vector<int>& input, std::vector<int>& scale, std::vector<int>& zerop)
+
+namespace details {
+::testing::internal::ParamGenerator<std::vector<ShapesInitializer>::value_type> ShapesIn(
+        const std::vector<ShapesInitializer>& container) {
+    return ::testing::ValuesIn(container.begin(), container.end());
+}
+
+}  // namespace details
+}  // anonymous namespace
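For reference, the Tensors macro above turns each Tensors{...}; literal in unpack.cpp into a shape-initializer lambda, which details::ShapesIn then feeds to gtest's ValuesIn. A self-contained sketch of the trick:

    #include <cassert>
    #include <functional>
    #include <vector>

    // Each Tensors{...} block expands to a lambda that fills the three shape
    // vectors by assignment; unset vectors (here, zerop) simply stay empty.
    #define Tensors [](std::vector<int>& input, std::vector<int>& scale, std::vector<int>& zerop)

    using ShapesInitializer = std::function<void(std::vector<int>&, std::vector<int>&, std::vector<int>&)>;

    int main() {
        ShapesInitializer init = Tensors{ input = {1, 32, 128}; scale = {1, 32, 1}; };
        std::vector<int> input, scale, zerop;
        init(input, scale, zerop);  // input == {1, 32, 128}, scale == {1, 32, 1}
        assert(input.size() == 3 && scale.size() == 3 && zerop.empty());
        return 0;
    }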