diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp
index 2ce67a71a5da57..df15feb46d3c44 100644
--- a/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp
+++ b/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp
@@ -55,4 +55,5 @@ DEFINE_OPT(NPUW_DUMP_SUBS, std::string, "", npuw::dump::subgraphs, CompileTime);
 DEFINE_OPT(NPUW_DUMP_SUBS_ON_FAIL, std::string, "", npuw::dump::subgraphs_on_fail, CompileTime);
 DEFINE_OPT(NPUW_DUMP_IO, std::string, "", npuw::dump::inputs_outputs, RunTime);
 DEFINE_OPT(NPUW_DUMP_IO_ITERS, bool, false, npuw::dump::io_iters, RunTime);
+DEFINE_OPT(NPUW_TRANSPOSE_WEIGHTS, bool, false, npuw::partitioning::transpose_weights, CompileTime);
 }  // namespace intel_npu
diff --git a/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp b/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp
index 627b6b957ebfb3..4ddbbc501decdc 100644
--- a/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp
+++ b/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp
@@ -177,6 +177,15 @@ static constexpr ov::Property<std::string> dcoff_type{"NPUW_DCOFF_TYPE"};
  */
 static constexpr ov::Property<bool> dcoff_with_scale{"NPUW_DCOFF_SCALE"};
 
+/**
+ * @brief
+ * Type: bool.
+ * Transpose input weights and the corresponding scale and zero-point tensors (if any) before
+ * passing them as inputs, when the MatMul layout requires it. Effective with function folding ("NPUW_FOLD").
+ * Default value: false.
+ */
+static constexpr ov::Property<bool> transpose_weights{"NPUW_TRANSPOSE_WEIGHTS"};
+
 /**
  * @brief
  * Type: bool.
diff --git a/src/plugins/intel_npu/src/al/src/config/npuw.cpp b/src/plugins/intel_npu/src/al/src/config/npuw.cpp
index a43794e883368c..21da949d1b97b7 100644
--- a/src/plugins/intel_npu/src/al/src/config/npuw.cpp
+++ b/src/plugins/intel_npu/src/al/src/config/npuw.cpp
@@ -35,6 +35,7 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) {
     desc.add<NPUW_ACC_CHECK>();
     desc.add<NPUW_ACC_THRESH>();
     desc.add<NPUW_ACC_DEVICE>();
+    desc.add<NPUW_TRANSPOSE_WEIGHTS>();
 #ifdef NPU_PLUGIN_DEVELOPER_BUILD
     desc.add<NPUW_DUMP_FULL>();
     desc.add<NPUW_DUMP_SUBS>();
diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
index 6cfb7fd7ce1dc7..5efadc8cb5d4b0 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -800,6 +800,7 @@ void ov::npuw::CompiledModel::implement_properties() {
         BIND(npuw::partitioning::funcall_for_all, NPUW_FUNCALL_FOR_ALL),
         BIND(npuw::partitioning::dcoff_type, NPUW_DCOFF_TYPE),
         BIND(npuw::partitioning::dcoff_with_scale, NPUW_DCOFF_SCALE),
+        BIND(npuw::partitioning::transpose_weights, NPUW_TRANSPOSE_WEIGHTS),
         BIND(npuw::parallel_compilation, NPUW_PARALLEL_COMPILE),
         BIND(npuw::funcall_async, NPUW_FUNCALL_ASYNC),
         BIND(npuw::weights_bank, NPUW_WEIGHTS_BANK),
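For context, a minimal sketch of how the new option could be exercised once this patch is applied. The model path is a placeholder, and the exact property set beyond NPU_USE_NPUW / NPUW_FOLD / NPUW_TRANSPOSE_WEIGHTS depends on the deployment:

    #include <openvino/openvino.hpp>

    int main() {
        ov::Core core;
        auto model = core.read_model("model.xml");  // placeholder path

        // NPUW_TRANSPOSE_WEIGHTS is documented to work with function folding,
        // so NPUW_FOLD is enabled alongside it here.
        ov::AnyMap config = {{"NPU_USE_NPUW", "YES"},
                             {"NPUW_FOLD", "YES"},
                             {"NPUW_TRANSPOSE_WEIGHTS", "YES"}};
        auto compiled = core.compile_model(model, "NPU", config);
        return 0;
    }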
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
index 1325fc1f1d1dd0..b6a6ef8c69195f 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
@@ -1606,32 +1606,55 @@ void Partitioner::decompressionCutOff(const std::string& func_name) {
     {
         LOG_BLOCK();
 
+        const bool enable_transpose = cfg.get<::intel_npu::NPUW_TRANSPOSE_WEIGHTS>();
+
         ov::npuw::patterns::DCOFFParams params_to;
 
         ov::pass::GraphRewrite rewr;
+
         // Old LLaMa-v2 patterns (Symmetric)
-        rewr.add_matcher<ov::npuw::patterns::SymmNoZP::DCOFFPassReshape1>(dcoff_mode, dcoff_type, std::ref(params_to))
+        rewr.add_matcher<ov::npuw::patterns::SymmNoZP::DCOFFPassReshape1>(dcoff_mode,
+                                                                          dcoff_type,
+                                                                          enable_transpose,
+                                                                          std::ref(params_to))
             ->build();
-        rewr.add_matcher<ov::npuw::patterns::SymmNoZP::DCOFFPassMatMul>(dcoff_mode, dcoff_type, std::ref(params_to))
+        rewr.add_matcher<ov::npuw::patterns::SymmNoZP::DCOFFPassMatMul>(dcoff_mode,
+                                                                        dcoff_type,
+                                                                        enable_transpose,
+                                                                        std::ref(params_to))
             ->build();
 
         // ChatGLM (GPTQ) and New LLaMa-v2 patterns (Symmetric)
-        rewr.add_matcher<ov::npuw::patterns::SymmZP::DCOFFPassReshape1>(dcoff_mode, dcoff_type, std::ref(params_to))
+        rewr.add_matcher<ov::npuw::patterns::SymmZP::DCOFFPassReshape1>(dcoff_mode,
+                                                                        dcoff_type,
+                                                                        enable_transpose,
+                                                                        std::ref(params_to))
             ->build();
-        rewr.add_matcher<ov::npuw::patterns::SymmZP::DCOFFPassConvert1>(dcoff_mode, dcoff_type, std::ref(params_to))
+        rewr.add_matcher<ov::npuw::patterns::SymmZP::DCOFFPassConvert1>(dcoff_mode,
+                                                                        dcoff_type,
+                                                                        enable_transpose,
+                                                                        std::ref(params_to))
             ->build();
 
         // LLaMaGPTQ
-        rewr.add_matcher<ov::npuw::patterns::SymmZP::DCOFFPassReshape2>(dcoff_mode, dcoff_type, std::ref(params_to));
+        rewr.add_matcher<ov::npuw::patterns::SymmZP::DCOFFPassReshape2>(dcoff_mode,
+                                                                        dcoff_type,
+                                                                        enable_transpose,
+                                                                        std::ref(params_to));
 
         // Phi-3 4SymW16A/GPTQ
-        rewr.add_matcher<ov::npuw::patterns::SymmZP::DCOFFPassCWAI3>(dcoff_mode, dcoff_type, std::ref(params_to));
+        rewr.add_matcher<ov::npuw::patterns::SymmZP::DCOFFPassReshape3>(dcoff_mode,
+                                                                        dcoff_type,
+                                                                        enable_transpose,
+                                                                        std::ref(params_to));
 
         // Asymmetric zeropoints
-        rewr.add_matcher<ov::npuw::patterns::AsymmZP::DCOFFPassReshape>(dcoff_mode, dcoff_type, std::ref(params_to));
+        rewr.add_matcher<ov::npuw::patterns::AsymmZP::DCOFFPassReshape>(dcoff_mode,
+                                                                        dcoff_type,
+                                                                        enable_transpose,
+                                                                        std::ref(params_to));
 
         rewr.run_on_model(f._model);
-
         ov::pass::Validate val;
         val.run_on_model(f._model);
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp
index 857bcd9c93ba56..d5cd2fccf9b94a 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp
@@ -21,6 +21,295 @@
 namespace ov {
 namespace npuw {
 
+namespace pattern_utils {
+
+std::shared_ptr<ov::op::v0::MatMul> get_root_matmul(ov::pass::pattern::Matcher& m) {
+    auto& node_to_output = m.get_pattern_value_map();
+    auto start_node = node_to_output.begin()->second.get_node_shared_ptr();
+    std::shared_ptr<ov::Node> current_node = start_node;
+    // Walk down the graph from the first matched node, following the first
+    // consumer at every step, until a MatMul is found
+    while (current_node) {
+        // Check if the current node is a MatMul
+        if (auto matmul = std::dynamic_pointer_cast<ov::op::v0::MatMul>(current_node)) {
+            return matmul;
+        }
+        // Move to the next node in the path if there is one
+        if (!current_node->outputs().empty()) {
+            auto output = current_node->outputs().at(0);
+            if (!output.get_target_inputs().empty()) {
+                current_node = output.get_target_inputs().begin()->get_node()->shared_from_this();
+            } else {
+                // No further consumers - end the search
+                break;
+            }
+        } else {
+            // No outputs - end the search
+            break;
+        }
+    }
+    return nullptr;
+}
+
+bool transpose_required(const std::shared_ptr<ov::op::v0::MatMul>& matmul_node) {
+    NPUW_ASSERT(matmul_node);
+    LOG_DEBUG("Checking the MatMul node: " << matmul_node);
+    const auto& input_shape = matmul_node->input_value(1).get_shape();
+    const auto& output_shape = matmul_node->output(0).get_shape();
+
+    if (output_shape.back() != input_shape.front()) {
+        return true;  // Weight is in the [K, N] layout - transpose is required
+    }
+    if (input_shape.size() == 2 && input_shape[0] == input_shape[1] && !matmul_node->get_transpose_b()) {
+        return true;  // Square weights are shape-ambiguous - rely on the transpose_b attribute
+    }
+    return false;
+}
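To make the shape heuristic concrete: the pass normalizes weights to the [N, K] layout with transpose_b=true. A weight stored as [K, N] with transpose_b=false yields output_shape.back() == N while input_shape.front() == K, which trips the first check; square weights (K == N) cannot be told apart by shape alone, hence the fallback on get_transpose_b(). A standalone restatement on plain shape vectors (hypothetical helper, not the plugin API):

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Same decision logic as transpose_required() above, on raw shapes.
    bool needs_weight_transpose(const std::vector<std::size_t>& weight_shape,
                                const std::vector<std::size_t>& output_shape,
                                bool transpose_b) {
        if (output_shape.back() != weight_shape.front()) {
            return true;  // [K, N] weight: normalize to [N, K] + transpose_b=true
        }
        if (weight_shape.size() == 2 && weight_shape[0] == weight_shape[1] && !transpose_b) {
            return true;  // square weight: shapes can't tell, use the attribute
        }
        return false;
    }

    int main() {
        // LLaMa-like FFN weight: K=4096, N=11008, stored [K, N], transpose_b=false
        assert((needs_weight_transpose({4096, 11008}, {1, 11008}, false)));
        // Already in the NPU-friendly [N, K] layout with transpose_b=true
        assert((!needs_weight_transpose({11008, 4096}, {1, 11008}, true)));
    }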
+void transpose_param_shape(std::shared_ptr<ov::op::v0::Parameter>& param) {
+    auto partial_shape = param->get_partial_shape();
+    NPUW_ASSERT(partial_shape.is_static());
+    auto shape = partial_shape.to_shape();
+    LOG_DEBUG("Transposing the param: " << param);
+
+    // Check if the shape is 2D or 3D
+    if (shape.size() == 2) {
+        // For 2D shapes, swap the dimensions
+        ov::Shape new_shape{shape[1], shape[0]};
+        shape = new_shape;
+    } else if (shape.size() == 3) {
+        // For 3D shapes, bring the last dimension to the front
+        ov::Shape new_shape{shape[2], shape[0], shape[1]};
+        shape = new_shape;
+    }
+
+    // Set the new shape on the parameter
+    param->set_partial_shape(ov::PartialShape(shape));
+    param->validate_and_infer_types();
+    LOG_DEBUG("Modified the shape to: " << param);
+}
+
+inline uint8_t lo4(uint8_t x) {
+    return x & 0x0F;
+}
+
+inline uint8_t hi4(uint8_t x) {
+    return x >> 4;
+}
+
+inline uint8_t tread_4b(const ov::Tensor& t, std::size_t index) {
+    const uint8_t* data = static_cast<const uint8_t*>(t.data());
+    if (index % 2 == 0) {
+        return lo4(data[index / 2]);
+    } else {
+        return hi4(data[index / 2]);
+    }
+}
+
+inline void twrite_4b(ov::Tensor& t, uint8_t value, std::size_t index) {
+    uint8_t* data = static_cast<uint8_t*>(t.data());
+    if (index % 2 == 0) {
+        data[index / 2] = (data[index / 2] & 0xF0) | (value & 0x0F);
+    } else {
+        data[index / 2] = (data[index / 2] & 0x0F) | (value << 4);
+    }
+}
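The helpers above assume the nibble packing used for the 4-bit tensors here: two elements per byte, with the even-indexed element in the low nibble. A self-contained demonstration of the same read/write logic on a raw byte buffer:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    int main() {
        // Four 4-bit elements packed into two bytes, low nibble first -
        // the same convention tread_4b()/twrite_4b() implement.
        std::vector<uint8_t> packed(2, 0);

        auto write4 = [&](std::size_t i, uint8_t v) {
            uint8_t& b = packed[i / 2];
            b = (i % 2 == 0) ? ((b & 0xF0) | (v & 0x0F)) : ((b & 0x0F) | (v << 4));
        };
        auto read4 = [&](std::size_t i) -> uint8_t {
            uint8_t b = packed[i / 2];
            return (i % 2 == 0) ? (b & 0x0F) : (b >> 4);
        };

        for (uint8_t i = 0; i < 4; ++i) write4(i, i + 10);  // values 10..13
        for (uint8_t i = 0; i < 4; ++i) assert(read4(i) == i + 10u);
        assert(packed[0] == 0xBA);  // 10 (0xA) in the low nibble, 11 (0xB) high
    }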
+ov::Tensor transpose_u4(const ov::Tensor& tensor) {
+    const auto& shape = tensor.get_shape();
+    NPUW_ASSERT(shape.size() == 2 || shape.size() == 3);
+
+    if (shape.size() == 2) {
+        // For a 2D tensor with shape KxM, transpose to MxK
+        size_t K = shape[0];
+        size_t M = shape[1];
+
+        ov::Tensor transposed_tensor(ov::element::u4, ov::Shape{M, K});
+        for (size_t k = 0; k < K; ++k) {
+            for (size_t m = 0; m < M; ++m) {
+                uint8_t value = tread_4b(tensor, k * M + m);
+                twrite_4b(transposed_tensor, value, m * K + k);
+            }
+        }
+        return transposed_tensor;
+    } else if (shape.size() == 3) {
+        // For a 3D tensor with shape KxMxN, transpose to NxKxM
+        size_t K = shape[0];
+        size_t M = shape[1];
+        size_t N = shape[2];
+
+        ov::Tensor transposed_tensor(ov::element::u4, ov::Shape{N, K, M});
+        for (size_t k = 0; k < K; ++k) {
+            for (size_t m = 0; m < M; ++m) {
+                for (size_t n = 0; n < N; ++n) {
+                    uint8_t value = tread_4b(tensor, k * (M * N) + m * N + n);
+                    twrite_4b(transposed_tensor, value, n * (K * M) + k * M + m);
+                }
+            }
+        }
+        return transposed_tensor;
+    }
+
+    // Invalid case
+    NPUW_ASSERT(false);
+    return ov::Tensor();  // unreachable
+}
+
+ov::Tensor transpose_i4(const ov::Tensor& tensor) {
+    // Same nibble-wise transposition as transpose_u4(), only the element
+    // type of the destination tensor differs
+    const auto& shape = tensor.get_shape();
+    NPUW_ASSERT(shape.size() == 2 || shape.size() == 3);
+
+    if (shape.size() == 2) {
+        // For a 2D tensor with shape KxM, transpose to MxK
+        size_t K = shape[0];
+        size_t M = shape[1];
+
+        ov::Tensor transposed_tensor(ov::element::i4, ov::Shape{M, K});
+        for (size_t k = 0; k < K; ++k) {
+            for (size_t m = 0; m < M; ++m) {
+                uint8_t value = tread_4b(tensor, k * M + m);
+                twrite_4b(transposed_tensor, value, m * K + k);
+            }
+        }
+        return transposed_tensor;
+    } else if (shape.size() == 3) {
+        // For a 3D tensor with shape KxMxN, transpose to NxKxM
+        size_t K = shape[0];
+        size_t M = shape[1];
+        size_t N = shape[2];
+
+        ov::Tensor transposed_tensor(ov::element::i4, ov::Shape{N, K, M});
+        for (size_t k = 0; k < K; ++k) {
+            for (size_t m = 0; m < M; ++m) {
+                for (size_t n = 0; n < N; ++n) {
+                    uint8_t value = tread_4b(tensor, k * (M * N) + m * N + n);
+                    twrite_4b(transposed_tensor, value, n * (K * M) + k * M + m);
+                }
+            }
+        }
+        return transposed_tensor;
+    }
+
+    // Invalid case
+    NPUW_ASSERT(false);
+    return ov::Tensor();  // unreachable
+}
+
+ov::Tensor transpose_f16(const ov::Tensor& tensor) {
+    const auto& shape = tensor.get_shape();
+    NPUW_ASSERT(shape.size() == 2 || shape.size() == 3);
+
+    // f16 values are copied as opaque 16-bit words here
+    const int16_t* data = reinterpret_cast<const int16_t*>(tensor.data());
+
+    if (shape.size() == 2) {
+        // For a 2D tensor with shape KxM, transpose to MxK
+        size_t K = shape[0];
+        size_t M = shape[1];
+
+        ov::Tensor transposed_tensor(ov::element::f16, ov::Shape{M, K});
+        int16_t* transposed_data = reinterpret_cast<int16_t*>(transposed_tensor.data());
+
+        for (size_t k = 0; k < K; ++k) {
+            for (size_t m = 0; m < M; ++m) {
+                transposed_data[m * K + k] = data[k * M + m];
+            }
+        }
+        return transposed_tensor;
+    } else if (shape.size() == 3) {
+        // For a 3D tensor with shape KxMxN, transpose to NxKxM
+        size_t K = shape[0];
+        size_t M = shape[1];
+        size_t N = shape[2];
+
+        ov::Tensor transposed_tensor(ov::element::f16, ov::Shape{N, K, M});
+        int16_t* transposed_data = reinterpret_cast<int16_t*>(transposed_tensor.data());
+
+        for (size_t n = 0; n < N; ++n) {
+            for (size_t k = 0; k < K; ++k) {
+                for (size_t m = 0; m < M; ++m) {
+                    transposed_data[n * (K * M) + k * M + m] = data[k * (M * N) + m * N + n];
+                }
+            }
+        }
+        return transposed_tensor;
+    }
+
+    // Invalid case
+    NPUW_ASSERT(false);
+    return ov::Tensor();  // unreachable
+}
+
+ov::Tensor transpose_f32(const ov::Tensor& tensor) {
+    const auto& shape = tensor.get_shape();
+    NPUW_ASSERT(shape.size() == 2 || shape.size() == 3);
+
+    const float* data = tensor.data<float>();
+
+    if (shape.size() == 2) {
+        // For a 2D tensor with shape KxM, transpose to MxK
+        size_t K = shape[0];
+        size_t M = shape[1];
+
+        ov::Tensor transposed_tensor(ov::element::f32, ov::Shape{M, K});
+        float* transposed_data = transposed_tensor.data<float>();
+
+        for (size_t k = 0; k < K; ++k) {
+            for (size_t m = 0; m < M; ++m) {
+                transposed_data[m * K + k] = data[k * M + m];
+            }
+        }
+        return transposed_tensor;
+    } else if (shape.size() == 3) {
+        // For a 3D tensor with shape KxMxN, transpose to NxKxM
+        size_t K = shape[0];
+        size_t M = shape[1];
+        size_t N = shape[2];
+
+        ov::Tensor transposed_tensor(ov::element::f32, ov::Shape{N, K, M});
+        float* transposed_data = transposed_tensor.data<float>();
+
+        for (size_t n = 0; n < N; ++n) {
+            for (size_t k = 0; k < K; ++k) {
+                for (size_t m = 0; m < M; ++m) {
+                    transposed_data[n * (K * M) + k * M + m] = data[k * (M * N) + m * N + n];
+                }
+            }
+        }
+        return transposed_tensor;
+    }
+
+    // Invalid case
+    NPUW_ASSERT(false);
+    return ov::Tensor();  // unreachable
+}
+
+ov::Tensor transpose_tensor(const ov::Tensor& tensor) {
+    switch (tensor.get_element_type()) {
+    case ov::element::u4:
+        return transpose_u4(tensor);
+    case ov::element::i4:
+        return transpose_i4(tensor);
+    case ov::element::f16:
+        return transpose_f16(tensor);
+    case ov::element::f32:
+        return transpose_f32(tensor);
+    default:
+        NPUW_ASSERT(false);
+    }
+    return ov::Tensor();  // unreachable
+}
+
+}  // namespace pattern_utils
+
 namespace patterns {
 
 namespace opp = ov::pass::pattern;
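All four typed routines implement the same index mapping: 2D [K, M] -> [M, K] and 3D [K, M, N] -> [N, K, M], mirroring transpose_param_shape(). A minimal check of the 2D mapping through the f32 entry point (assuming the pattern_utils declarations above are visible to this translation unit):

    #include <cassert>
    #include <openvino/runtime/tensor.hpp>

    int main() {
        // 2x3 f32 tensor filled with 0..5 in row-major order
        ov::Tensor t(ov::element::f32, ov::Shape{2, 3});
        float* d = t.data<float>();
        for (int i = 0; i < 6; ++i) d[i] = static_cast<float>(i);

        ov::Tensor tt = ov::npuw::pattern_utils::transpose_tensor(t);
        assert((tt.get_shape() == ov::Shape{3, 2}));

        // transposed[m][k] == original[k][m]
        const float* td = tt.data<float>();
        for (int k = 0; k < 2; ++k)
            for (int m = 0; m < 3; ++m)
                assert(td[m * 2 + k] == d[k * 3 + m]);
    }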
@@ -73,6 +362,10 @@ ClosureRemap build_remap(const Function& fbody, const DCOFFParams& params_to) {
         LOG_DEBUG("Checking the function parameter " << param);
         LOG_BLOCK();
 
+        if (params_to.transpose_required.find(param) != params_to.transpose_required.end()) {
+            m.transpose_indices.push_back(i - fbody._param_offset);
+        }
+
         // First find among scale factors...
         auto pscale_iter = params_to.scales.find(param);
         auto pzerop_iter = params_to.zerops_asymm.find(param);
@@ -114,23 +407,60 @@ ClosureRemap build_remap(const Function& fbody, const DCOFFParams& params_to) {
 }
 
 void apply_remap(Subgraph& fcall, const ClosureRemap& m) {
-    std::vector<ov::Tensor> new_closure;
-    std::vector<ov::Tensor> new_scales;
-    std::vector<ov::Tensor> new_zerops;
-
-    // Form a new_closure vector by rearranging the old one. Also
-    // reserve a new_scales vector to have the same size, filled with
-    // empty tensors by default.
-    for (auto&& i : m.closure_remap) {
-        new_closure.push_back(fcall._closure[i]);
-
-        auto scale_iter = m.scale_remap.find(i);
-        new_scales.push_back(scale_iter != m.scale_remap.end() ? fcall._closure[scale_iter->second] : ov::Tensor());
-
-        // Check for asymmetric zero points and add them to new_zerops
-        auto zerop_iter = m.zerop_remap.find(i);
-        const auto& zerop = zerop_iter != m.zerop_remap.end() ? fcall._closure[zerop_iter->second] : m.zero_points[i];
-        new_zerops.push_back(zerop);
+    std::vector<ov::Tensor> new_closure(m.closure_remap.size());
+    std::vector<ov::Tensor> new_scales(m.closure_remap.size());
+    std::vector<ov::Tensor> new_zerops(m.closure_remap.size());
+
+    // Create mappings from original indices to new indices
+    std::unordered_map<std::size_t, std::size_t> closure_to_new_index;
+    std::unordered_map<std::size_t, std::size_t> scale_to_new_index;
+    std::unordered_map<std::size_t, std::size_t> zerop_to_new_index;
+
+    // Fill the new vectors and create the mappings
+    for (std::size_t i = 0; i < m.closure_remap.size(); ++i) {
+        auto orig_index = m.closure_remap[i];
+        new_closure[i] = fcall._closure[orig_index];
+        closure_to_new_index[orig_index] = i;
+
+        // Handle the scale remap
+        auto scale_iter = m.scale_remap.find(orig_index);
+        if (scale_iter != m.scale_remap.end()) {
+            new_scales[i] = fcall._closure[scale_iter->second];
+            scale_to_new_index[scale_iter->second] = i;
+        }
+
+        // Handle the zero-point remap
+        auto zerop_iter = m.zerop_remap.find(orig_index);
+        if (zerop_iter != m.zerop_remap.end()) {
+            new_zerops[i] = fcall._closure[zerop_iter->second];
+            zerop_to_new_index[zerop_iter->second] = i;
+        } else {
+            new_zerops[i] = m.zero_points[orig_index];
+        }
     }
+
+    // Transpose the concerned tensors
+    for (auto&& orig_index : m.transpose_indices) {
+        // Transpose closure (weight) tensors if needed
+        auto closure_it = closure_to_new_index.find(orig_index);
+        if (closure_it != closure_to_new_index.end()) {
+            new_closure[closure_it->second] = pattern_utils::transpose_tensor(new_closure[closure_it->second]);
+        }
+
+        // Transpose scale tensors if needed
+        auto scale_it = scale_to_new_index.find(orig_index);
+        if (scale_it != scale_to_new_index.end()) {
+            new_scales[scale_it->second] = pattern_utils::transpose_tensor(new_scales[scale_it->second]);
+        }
+
+        // Transpose zero-point tensors if needed
+        auto zerop_it = zerop_to_new_index.find(orig_index);
+        if (zerop_it != zerop_to_new_index.end()) {
+            new_zerops[zerop_it->second] = pattern_utils::transpose_tensor(new_zerops[zerop_it->second]);
+        }
+    }
+
+    // Update the Subgraph with the new vectors
     fcall._closure = std::move(new_closure);
     fcall._scales = std::move(new_scales);
     fcall._zerops = std::move(new_zerops);
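The rewritten apply_remap() is a two-phase scheme: first rearrange the closure by closure_remap while recording where every original index landed, then transpose only the tensors flagged in transpose_indices through those recorded positions. The same idea on toy integer data (negation standing in for transpose_tensor()):

    #include <cassert>
    #include <cstddef>
    #include <unordered_map>
    #include <vector>

    int main() {
        std::vector<int> closure = {10, 11, 12, 13};
        std::vector<std::size_t> closure_remap = {2, 0};   // new[0]=old[2], new[1]=old[0]
        std::vector<std::size_t> transpose_indices = {2};  // old index 2 is flagged

        // Phase 1: rearrange, remembering old index -> new index
        std::vector<int> new_closure(closure_remap.size());
        std::unordered_map<std::size_t, std::size_t> old_to_new;
        for (std::size_t i = 0; i < closure_remap.size(); ++i) {
            new_closure[i] = closure[closure_remap[i]];
            old_to_new[closure_remap[i]] = i;
        }

        // Phase 2: post-process only the flagged entries
        for (auto old_idx : transpose_indices) {
            auto it = old_to_new.find(old_idx);
            if (it != old_to_new.end()) new_closure[it->second] = -new_closure[it->second];
        }

        assert((new_closure == std::vector<int>{-12, 10}));
    }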
@@ -192,10 +522,14 @@ void finalize_remap(Function& fbody, const ClosureRemap& m) {
 // its Parameter B).
 
 namespace SymmNoZP {
 
-DCOFFPassBase::DCOFFPassBase(DCOffMode dcoff_mode, ov::element::Type dcoff_type, DCOFFParamRef pref)
+DCOFFPassBase::DCOFFPassBase(DCOffMode dcoff_mode,
+                             ov::element::Type dcoff_type,
+                             bool enable_transpose,
+                             DCOFFParamRef pref)
     : m_dcoff_mode(dcoff_mode),
       m_dcoff_type(dcoff_type),
-      m_params_to(pref) {}
+      m_params_to(pref),
+      m_enable_transpose(enable_transpose) {}
 
 void DCOFFPassBase::build() {
     paramA = opp::wrap_type<ov::op::v0::Parameter>();
@@ -327,9 +661,13 @@ namespace SymmZP {
 //     V                >
 //
 
-DCOFFPassBase::DCOFFPassBase(DCOffMode dcoff_mode, ov::element::Type dcoff_type, DCOFFParamRef pref)
+DCOFFPassBase::DCOFFPassBase(DCOffMode dcoff_mode,
+                             ov::element::Type dcoff_type,
+                             bool enable_transpose,
+                             DCOFFParamRef pref)
     : m_dcoff_mode(dcoff_mode),
       m_dcoff_type(dcoff_type),
-      m_params_to(pref) {}
+      m_params_to(pref),
+      m_enable_transpose(enable_transpose) {}
 
 void DCOFFPassBase::build() {
@@ -464,7 +802,10 @@ void DCOFFPassConvert1::reconnect_root(ov::pass::pattern::Matcher& m) {
 //     V                >
 //
 
-DCOFFPassReshape2::DCOFFPassReshape2(DCOffMode dcoff_mode, ov::element::Type dcoff_type, DCOFFParamRef pref) {
+DCOFFPassReshape2::DCOFFPassReshape2(DCOffMode dcoff_mode,
+                                     ov::element::Type dcoff_type,
+                                     bool enable_transpose,
+                                     DCOFFParamRef pref) {
     auto paramA = opp::wrap_type<ov::op::v0::Parameter>();
     auto constB = opp::wrap_type<ov::op::v0::Constant>();
     auto paramC = opp::wrap_type<ov::op::v0::Parameter>();
@@ -495,6 +836,16 @@ DCOFFPassReshape2::DCOFFPassReshape2(DCOffMode dcoff_mode, ov::element::Type dcoff_type, DCOFFParamRef pref) {
         LOG_DEBUG("Matched: " << matched_paramA << ", set element type to " << dcoff_type);
         matched_paramA->set_element_type(dcoff_type);
 
+        auto matched_MM = pattern_utils::get_root_matmul(m);
+        const bool need_transpose = matched_MM && pattern_utils::transpose_required(matched_MM);
+        if (enable_transpose && need_transpose) {
+            pref.get().transpose_required.insert(matched_paramA);
+            pref.get().transpose_required.insert(matched_paramC);
+            pattern_utils::transpose_param_shape(matched_paramA);
+            pattern_utils::transpose_param_shape(matched_paramC);
+            matched_MM->set_transpose_b(true);
+        }
+
         if (dcoff_mode == DCOffMode::CAST_SCALE) {
             NPUW_ASSERT(dcoff_type == ov::element::f16);
@@ -527,7 +878,23 @@ DCOFFPassReshape2::DCOFFPassReshape2(DCOffMode dcoff_mode, ov::element::Type dcoff_type, DCOFFParamRef pref) {
             LOG_DEBUG("Reconnecting the Root...");
             auto matched_reshpe = node_to_output.at(reshpe).get_node_shared_ptr();
+            LOG_DEBUG(matched_reshpe);
+            if (enable_transpose && need_transpose) {
+                auto matched_param_shape = matched_paramA->get_shape();
+
+                // The transposed weight flips the Reshape geometry: keep the
+                // leading (output-channel) dimension and fold the remaining ones
+                std::vector<int32_t> new_shape_pattern = {static_cast<int32_t>(matched_param_shape[0]), -1};
+                auto new_shape_pattern_node =
+                    std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{2}, new_shape_pattern);
+
+                // Create a new Reshape over the converted weight with the new shape
+                // pattern (the old Reshape is reconnected below only as a no-op here)
+                auto new_reshape_node =
+                    std::make_shared<ov::op::v1::Reshape>(matched_convrt, new_shape_pattern_node, false);
+                ov::replace_node(matched_reshpe, new_reshape_node);
+            }
             matched_reshpe->input(0).replace_source_output(matched_convrt);
+            LOG_DEBUG(matched_MM);
         }
         LOG_DEBUG("Done");
     }
@@ -555,7 +922,10 @@ DCOFFPassReshape2::DCOFFPassReshape2(DCOffMode dcoff_mode, ov::element::Type dcoff_type, DCOFFParamRef pref) {
 //     V                >
 //      Convert
 
-DCOFFPassCWAI3::DCOFFPassCWAI3(DCOffMode dcoff_mode, ov::element::Type dcoff_type, DCOFFParamRef pref) {
+DCOFFPassReshape3::DCOFFPassReshape3(DCOffMode dcoff_mode,
+                                     ov::element::Type dcoff_type,
+                                     bool enable_transpose,
+                                     DCOFFParamRef pref) {
     auto paramA = opp::wrap_type<ov::op::v0::Parameter>();
     auto paramC = opp::wrap_type<ov::op::v0::Parameter>();
     auto cvtA = opp::wrap_type<ov::op::v0::Convert>({paramA});
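The rewrite in DCOFFPassReshape2 leans on the MatMul identity A*B == A*(B^T)^T: once the weight parameter (and its scale) is physically transposed, setting transpose_b=true preserves the original semantics. A small sketch of the normalized form with hypothetical LLaMa-like shapes:

    #include <memory>
    #include <openvino/op/matmul.hpp>
    #include <openvino/op/parameter.hpp>

    int main() {
        // Activations [1, K] and a weight already stored transposed as [N, K]
        auto act = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::Shape{1, 4096});
        auto weightT = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::Shape{11008, 4096});

        // transpose_a=false, transpose_b=true: output is [1, 11008], exactly as
        // if the weight were stored [4096, 11008] with transpose_b=false
        auto mm = std::make_shared<ov::op::v0::MatMul>(act, weightT, false, true);
        mm->validate_and_infer_types();
        return mm->get_output_shape(0) == ov::Shape{1, 11008} ? 0 : 1;
    }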
@@ -847,7 +1217,10 @@ namespace AsymmZP {
 //   :       >
 //   V       >
 //
-DCOFFPassReshape::DCOFFPassReshape(DCOffMode dcoff_mode, ov::element::Type dcoff_type, DCOFFParamRef pref) {
+DCOFFPassReshape::DCOFFPassReshape(DCOffMode dcoff_mode,
+                                   ov::element::Type dcoff_type,
+                                   bool enable_transpose,
+                                   DCOFFParamRef pref) {
     auto paramA = opp::wrap_type<ov::op::v0::Parameter>();
     auto paramB = opp::wrap_type<ov::op::v0::Parameter>();
     auto paramC = opp::wrap_type<ov::op::v0::Parameter>();
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.hpp
index c0b394616c6ed5..cd90b74301a12a 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.hpp
@@ -31,12 +31,14 @@ struct DCOFFParams {
     std::unordered_map<PPtr, PPtr> scales;        // Closures: a scaling factor -> orig tensor
     std::unordered_map<CPtr, PPtr> zerops;        // Closures: orig tensor -> a zero point (yes, a reverse...)
     std::unordered_map<PPtr, PPtr> zerops_asymm;  // Closures: orig tensor -> an asymmetric zerop parameter
+    std::unordered_set<PPtr> transpose_required;  // Parameters that need to be transposed
 };
 
 using DCOFFParamRef = std::reference_wrapper<DCOFFParams>;
 
 struct ClosureRemap {
     std::vector<std::size_t> closure_remap;      // [new closure index] -> orig closure idx
+    std::vector<std::size_t> transpose_indices;  // Orig closure indices of tensors that require transposition
     std::map<std::size_t, std::size_t> scale_remap;  // orig closure idx -> orig scale idx
     std::map<std::size_t, std::size_t> zerop_remap;  // orig closure idx -> orig asymm zero point idx
     ov::ParameterVector params_to_remove;
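Taken together, the new fields form a three-stage handoff: the DCOFF matchers flag Parameters in DCOFFParams::transpose_required, build_remap() translates the flags into closure indices in ClosureRemap::transpose_indices, and apply_remap() finally transposes the tensors. A condensed, hypothetical view of that flow (the surrounding plugin logic is elided):

    #include <memory>
    #include <unordered_set>
    #include <vector>
    #include <openvino/op/parameter.hpp>

    using PPtr = std::shared_ptr<ov::op::v0::Parameter>;  // mirrors dcoff.hpp

    int main() {
        std::unordered_set<PPtr> transpose_required;  // filled by the matchers
        std::vector<std::size_t> transpose_indices;   // filled by build_remap()

        auto param = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::Shape{32, 32});
        transpose_required.insert(param);  // matcher: flag the Parameter

        std::size_t closure_index = 0;     // build_remap: i - fbody._param_offset
        if (transpose_required.count(param)) {
            transpose_indices.push_back(closure_index);
        }
        // apply_remap() later transposes fcall._closure entries at these indices
        return static_cast<int>(transpose_indices.size()) - 1;  // 0 on the expected path
    }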
@@ -57,12 +59,13 @@ class DCOFFPassBase : public ov::pass::MatcherPass {
     DCOffMode m_dcoff_mode = DCOffMode::CAST_ONLY;
     ov::element::Type m_dcoff_type;
     DCOFFParamRef m_params_to;
+    bool m_enable_transpose;
 
     std::shared_ptr<ov::Node> paramA, paramB, toFP32, mulply;
 
     bool matcher_callback(ov::pass::pattern::Matcher& m);
 
 public:
-    DCOFFPassBase(DCOffMode dcoff_mode, ov::element::Type dcoff_type, DCOFFParamRef pref);
+    DCOFFPassBase(DCOffMode dcoff_mode, ov::element::Type dcoff_type, bool enable_transpose, DCOFFParamRef pref);
 
     virtual void build();
     virtual void reconnect_root_to_convert(ov::pass::pattern::Matcher& m) = 0;
@@ -97,12 +100,13 @@ class DCOFFPassBase : public ov::pass::MatcherPass {
     DCOffMode m_dcoff_mode = DCOffMode::CAST_ONLY;
     ov::element::Type m_dcoff_type;
     DCOFFParamRef m_params_to;
+    bool m_enable_transpose;
 
     std::shared_ptr<ov::Node> paramA, constB, paramC, cvtA, cvtB, subtr, mulply;
 
     bool matcher_callback(ov::pass::pattern::Matcher& m);
 
 public:
-    DCOFFPassBase(DCOffMode dcoff_mode, ov::element::Type dcoff_type, DCOFFParamRef pref);
+    DCOFFPassBase(DCOffMode dcoff_mode, ov::element::Type dcoff_type, bool enable_transpose, DCOFFParamRef pref);
 
     virtual void build();
     virtual void reconnect_root(ov::pass::pattern::Matcher& m) = 0;
@@ -128,12 +132,12 @@ class DCOFFPassConvert1 final : public DCOFFPassBase {
 
 class DCOFFPassReshape2 : public ov::pass::MatcherPass {
 public:
-    DCOFFPassReshape2(DCOffMode dcoff_mode, ov::element::Type dcoff_type, DCOFFParamRef pref);
+    DCOFFPassReshape2(DCOffMode dcoff_mode, ov::element::Type dcoff_type, bool enable_transpose, DCOFFParamRef pref);
 };
 
-class DCOFFPassCWAI3 : public ov::pass::MatcherPass {
+class DCOFFPassReshape3 : public ov::pass::MatcherPass {
 public:
-    DCOFFPassCWAI3(DCOffMode dcoff_mode, ov::element::Type dcoff_type, DCOFFParamRef pref);
+    DCOFFPassReshape3(DCOffMode dcoff_mode, ov::element::Type dcoff_type, bool enable_transpose, DCOFFParamRef pref);
 };
 
 class CWAI1 : public ov::pass::MatcherPass {
@@ -165,7 +169,7 @@ class CWAI3 : public ov::pass::MatcherPass {
 namespace AsymmZP {
 
 class DCOFFPassReshape : public ov::pass::MatcherPass {
 public:
-    DCOFFPassReshape(DCOffMode dcoff_mode, ov::element::Type dcoff_type, DCOFFParamRef pref);
+    DCOFFPassReshape(DCOffMode dcoff_mode, ov::element::Type dcoff_type, bool enable_transpose, DCOFFParamRef pref);
 };
 
 }  // namespace AsymmZP