From 305ff09458cf257fd38aa7997d7f7de9aedb5286 Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Thu, 26 Sep 2024 15:47:05 +0400 Subject: [PATCH] [Snippets][CPU] Added BrgemmCopyA op --- src/common/snippets/src/op/subgraph.cpp | 1 + .../emitters/snippets/x64/cpu_generator.cpp | 9 +- .../x64/jit_brgemm_copy_a_emitter.cpp | 90 +++++++ .../x64/jit_brgemm_copy_a_emitter.hpp | 41 ++++ .../x64/jit_brgemm_copy_b_emitter.cpp | 10 +- .../snippets/x64/jit_brgemm_emitter.cpp | 37 ++- .../snippets/x64/kernel_executors/brgemm.cpp | 22 +- .../x64/kernel_executors/brgemm_copy_a.cpp | 225 ++++++++++++++++++ .../x64/kernel_executors/brgemm_copy_a.hpp | 104 ++++++++ .../x64/kernel_executors/brgemm_copy_b.cpp | 4 +- src/plugins/intel_cpu/src/nodes/subgraph.cpp | 2 +- .../snippets/x64/op/brgemm_copy_a.cpp | 63 +++++ .../snippets/x64/op/brgemm_copy_a.hpp | 52 ++++ .../snippets/x64/op/brgemm_copy_b.cpp | 41 ++-- .../snippets/x64/op/brgemm_copy_b.hpp | 10 +- .../snippets/x64/op/brgemm_cpu.cpp | 70 +++--- .../snippets/x64/op/brgemm_cpu.hpp | 18 +- .../snippets/x64/op/brgemm_utils.cpp | 97 ++++---- .../snippets/x64/op/brgemm_utils.hpp | 58 ++--- .../x64/pass/brgemm_to_brgemm_cpu.cpp | 111 +++++---- .../x64/pass/lowered/brgemm_cpu_blocking.cpp | 66 +++-- .../x64/pass/lowered/brgemm_cpu_blocking.hpp | 4 + ...cpp => brgemm_copy_buffer_expressions.cpp} | 29 ++- ...hpp => brgemm_copy_buffer_expressions.hpp} | 15 ++ .../lowered/insert_brgemm_copy_b_buffers.cpp | 36 +-- .../snippets/x64/shape_inference.cpp | 2 + .../skip_tests_config.cpp | 44 ++-- .../x64/lowered/brgemm_blocking.cpp | 56 +++-- .../x64/lowered/buffer_allocation.cpp | 21 +- .../common_test_utils/src/ov_tensor_utils.cpp | 4 +- 30 files changed, 1020 insertions(+), 322 deletions(-) create mode 100644 src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_a_emitter.cpp create mode 100644 src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_a_emitter.hpp create mode 100644 
src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_a.cpp create mode 100644 src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_a.hpp create mode 100644 src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_a.cpp create mode 100644 src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_a.hpp rename src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/expressions/{brgemm_copy_b_buffer_expressions.cpp => brgemm_copy_buffer_expressions.cpp} (75%) rename src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/expressions/{brgemm_copy_b_buffer_expressions.hpp => brgemm_copy_buffer_expressions.hpp} (69%) diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp index 98e3392a65e1e2..01aba2baa1cc3d 100644 --- a/src/common/snippets/src/op/subgraph.cpp +++ b/src/common/snippets/src/op/subgraph.cpp @@ -428,6 +428,7 @@ void Subgraph::data_flow_transformations(const BlockedShapeVector& blocked_input manager.register_pass(); manager.register_positioned_passes(backend_passes); + manager.register_pass("body.xml", "body.bin"); manager.run_passes(body_ptr()); } diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp index 2cfd6e714e1dd8..e987bd90e8afdb 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp @@ -7,6 +7,7 @@ #include "snippets/snippets_isa.hpp" #include "emitters/snippets/cpu_runtime_configurator.hpp" +#include "emitters/snippets/x64/jit_brgemm_copy_a_emitter.hpp" #include "emitters/snippets/x64/jit_brgemm_copy_b_emitter.hpp" #include "emitters/snippets/x64/jit_brgemm_emitter.hpp" #include "emitters/snippets/x64/jit_memory_emitters.hpp" @@ -23,6 +24,7 @@ #include "transformations/snippets/common/op/load_convert.hpp" #include 
"transformations/snippets/common/op/store_convert.hpp" #include "transformations/snippets/common/op/fused_mul_add.hpp" +#include "transformations/snippets/x64/op/brgemm_copy_a.hpp" #include "transformations/snippets/x64/op/brgemm_copy_b.hpp" #include "transformations/snippets/x64/op/brgemm_cpu.hpp" #include "transformations/snippets/x64/op/perf_count_rdtsc.hpp" @@ -243,6 +245,9 @@ intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t ho jitters[intel_cpu::BrgemmCPU::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_brgemm_emitter, configurator->get_kernel_executor_table(), compiled_kernel_cache); + jitters[intel_cpu::BrgemmCopyA::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_brgemm_copy_a_emitter, + configurator->get_kernel_executor_table(), + compiled_kernel_cache); jitters[intel_cpu::BrgemmCopyB::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_brgemm_copy_b_emitter, configurator->get_kernel_executor_table(), compiled_kernel_cache); @@ -356,6 +361,7 @@ ov::snippets::RegType intel_cpu::CPUGenerator::get_specific_op_out_reg_type(cons std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) || #endif + std::dynamic_pointer_cast(op)|| std::dynamic_pointer_cast(op)) return ov::snippets::RegType::gpr; else if ( @@ -368,7 +374,8 @@ ov::snippets::RegType intel_cpu::CPUGenerator::get_specific_op_out_reg_type(cons bool intel_cpu::CPUGenerator::uses_precompiled_kernel(const std::shared_ptr& e) const { bool need = std::dynamic_pointer_cast(e) || - std::dynamic_pointer_cast(e); + std::dynamic_pointer_cast(e) || + std::dynamic_pointer_cast(e); #ifdef SNIPPETS_DEBUG_CAPS const auto cpu_target_machine = std::dynamic_pointer_cast(target); need = need || (cpu_target_machine && cpu_target_machine->debug_config.enable_segfault_detector) || diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_a_emitter.cpp 
b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_a_emitter.cpp new file mode 100644 index 00000000000000..94fc1357b27c4c --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_a_emitter.cpp @@ -0,0 +1,90 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "jit_brgemm_copy_a_emitter.hpp" + +#include "emitters/plugin/x64/utils.hpp" +#include "emitters/snippets/x64/utils.hpp" +#include "emitters/snippets/jit_snippets_call_args.hpp" + +#include "snippets/utils/utils.hpp" + +#include "transformations/snippets/x64/op/brgemm_copy_a.hpp" +#include "transformations/snippets/x64/op/brgemm_utils.hpp" + + +using namespace Xbyak; +using namespace dnnl::impl; +using namespace dnnl::impl::cpu::x64; +using namespace ov::intel_cpu::brgemm_utils; +using namespace ov::snippets::utils; + +namespace ov { +namespace intel_cpu { + +jit_brgemm_copy_a_emitter::jit_brgemm_copy_a_emitter(jit_generator* h, cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr, + const snippets::KernelExecutorTablePtr& kernel_table, + const ov::intel_cpu::MultiCacheWeakPtr& compiled_kernel_cache) + : jit_emitter(h, isa) { + in_out_type_ = emitter_in_out_map::gpr_to_gpr; + const auto brgemm_repack = ov::as_type_ptr(expr->get_node()); + OV_CPU_JIT_EMITTER_ASSERT(brgemm_repack, "expects BrgemmCopyA node"); + + // Note: even if the BrgemmCopyA node is dynamic, the first shapeInfer and RuntimeConfigurator::update() + // are performed before the BrgemmCopyAKernelExecutor registration. So we have to trigger update() manually + // for both static and the 1st dynamic shapes. 
+ OV_CPU_JIT_EMITTER_ASSERT(!snippets::utils::is_dynamic_vdims(expr->get_input_port_descriptor(0)->get_shape()), + "Jit emitter is called when the shapes are unknown"); + + const auto& brgemm_config = brgemm_repack->get_config(); + BrgemmCopyAKernelConfig kernel_config(brgemm_repack->get_input_element_type(0), brgemm_config.isa()); + m_kernel_executor = kernel_table->register_kernel(expr, compiled_kernel_cache, kernel_config); + + m_memory_offsets = {brgemm_repack->get_offset_in(), brgemm_repack->get_offset_out()}; + m_buffer_ids = {utils::get_buffer_cluster_id(expr->get_input_port(0)), utils::get_buffer_cluster_id(expr->get_output_port(0))}; +} + +void jit_brgemm_copy_a_emitter::validate_arguments(const std::vector &in, const std::vector &out) const { + OV_CPU_JIT_EMITTER_ASSERT(in.size() == 1 && out.size() == 1, "expects 1 input and 1 output"); +} + +void jit_brgemm_copy_a_emitter::emit_impl(const std::vector& in, const std::vector& out) const { + validate_arguments(in, out); + + std::vector mem_ptrs_idxs{in[0], out[0]}; + + EmitABIRegSpills spill(h); + spill.preamble(); + + h->mov(h->rbp, reinterpret_cast(BrgemmCopyAKernelExecutor::execute)); + auto reserved_stack_size = sizeof(BrgemmCopyAKernelExecutor::call_args); + // Reserve memory on the stack + h->sub(h->rsp, reserved_stack_size); + + Xbyak::Reg64 aux_reg = ov::intel_cpu::utils::get_aux_gpr(mem_ptrs_idxs); + + const std::vector args_offsets {GET_OFF_BRGEMM_COPY_A_ARGS(src), GET_OFF_BRGEMM_COPY_A_ARGS(tr_src)}; + const auto& mem_ptrs = ov::intel_cpu::utils::transform_idxs_to_regs(mem_ptrs_idxs); + for (size_t i = 0; i < mem_ptrs.size(); i++) { + if (ov::snippets::utils::is_dynamic_value(m_memory_offsets[i])) + utils::push_ptr_with_runtime_offset_on_stack(h, args_offsets[i], mem_ptrs[i], aux_reg, + GET_OFF(buffer_offsets) + m_buffer_ids[i] * sizeof(size_t)); + else + utils::push_ptr_with_static_offset_on_stack(h, args_offsets[i], mem_ptrs[i], aux_reg, m_memory_offsets[i]); + } + + h->mov(abi_param1, 
reinterpret_cast(m_kernel_executor.get())); + h->mov(abi_param2, h->rsp); + + spill.rsp_align(); + h->call(h->rbp); + spill.rsp_restore(); + + h->add(h->rsp, reserved_stack_size); + + spill.postamble(); +} + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_a_emitter.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_a_emitter.hpp new file mode 100644 index 00000000000000..a49be02b3ca578 --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_a_emitter.hpp @@ -0,0 +1,41 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "emitters/plugin/x64/jit_emitter.hpp" + +#include "kernel_executors/brgemm_copy_a.hpp" + + +namespace ov { +namespace intel_cpu { + +class jit_brgemm_copy_a_emitter : public jit_emitter { +public: + jit_brgemm_copy_a_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr, + const snippets::KernelExecutorTablePtr& kernel_table, + const ov::intel_cpu::MultiCacheWeakPtr& compiled_kernel_cache); + + size_t get_inputs_num() const override {return 1;} + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr) { + return {{element::i8}, {element::u8}, {element::bf16}}; + } + +private: + void validate_arguments(const std::vector &in, const std::vector &out) const override; + void emit_impl(const std::vector& in, const std::vector& out) const override; + + std::vector m_memory_offsets{}; + std::vector m_buffer_ids{}; + std::shared_ptr m_kernel_executor {nullptr}; + +#ifdef SNIPPETS_DEBUG_CAPS + friend std::string init_info_jit_brgemm_copy_a_emitter(const jit_brgemm_copy_a_emitter *emitter); +#endif +}; + +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.cpp 
b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.cpp index e68ab224407c7b..56a3929be7de3c 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.cpp @@ -48,18 +48,14 @@ jit_brgemm_copy_b_emitter::jit_brgemm_copy_b_emitter(jit_generator* h, cpu_isa_t OV_CPU_JIT_EMITTER_ASSERT(!snippets::utils::is_dynamic_vdims(expr->get_input_port_descriptor(0)->get_shape()), "Jit emitter is called when the shapes are unknown"); - const auto& in_subtensor = get_projected_subtensor(expr->get_input_port(0)); - const auto K_blk = *++in_subtensor.rbegin(); - const auto& src_prc = brgemm_repack->get_src_element_type(); const auto& wei_prc = brgemm_repack->get_input_element_type(0); const auto wei_N_blk = brgemm_utils::repacking::compute_inner_n_block(wei_prc); const auto is_transposed = get_is_transposed(expr); - const auto brgemm_type = get_brgemm_type(src_prc, K_blk, is_transposed); - const auto primitive_isa = brgemm_utils::get_primitive_isa(src_prc, with_amx(brgemm_type)); - m_with_comp = with_compensations(brgemm_type); + const auto& brgemm_config = brgemm_repack->get_config(); + m_with_comp = brgemm_config.need_compensations(); - BrgemmCopyBKernelConfig kernel_config(src_prc, wei_prc, primitive_isa, m_with_comp, is_transposed, wei_N_blk); + BrgemmCopyBKernelConfig kernel_config(src_prc, wei_prc, brgemm_config.isa(), m_with_comp, is_transposed, wei_N_blk); m_kernel_executor = kernel_table->register_kernel(expr, compiled_kernel_cache, kernel_config); m_memory_offsets = {brgemm_repack->get_offset_in(), brgemm_repack->get_offset_out()}; diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.cpp index 057a3687ab8d16..4d1009e3cb59a0 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.cpp +++ 
b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.cpp @@ -26,12 +26,10 @@ jit_brgemm_emitter::jit_brgemm_emitter(jit_generator* h, cpu_isa_t isa, const auto& brgemm_node = as_type_ptr(expr->get_node()); const auto& brg0Prc = brgemm_node->get_input_element_type(0); const auto& brg1Prc = brgemm_node->get_input_element_type(1); - const auto brgemm_type = brgemm_node->get_type(); - BrgemmKernelConfig kernel_config(brg0Prc, brg1Prc, with_amx(brgemm_type), with_compensations(brgemm_type), - brgemm_utils::get_primitive_isa(brg0Prc, with_amx(brgemm_type))); - m_kernel_executor = kernel_table->register_kernel(expr, - compiled_kernel_cache, - kernel_config); + const auto& brgemm_config = brgemm_node->get_config(); + BrgemmKernelConfig kernel_config(brg0Prc, brg1Prc, brgemm_config.is_amx(), brgemm_config.need_compensations(), brgemm_config.isa()); + m_kernel_executor = kernel_table->register_kernel(expr, compiled_kernel_cache, kernel_config); + // Note: even if the Brgemm node is dynamic, the first shapeInfer and RuntimeConfigurator::update() // are performed before the BrgemmKernelExecutor registration. So we have to trigger update() manually // for both static and the 1st dynamic shapes. 
@@ -42,7 +40,7 @@ jit_brgemm_emitter::jit_brgemm_emitter(jit_generator* h, cpu_isa_t isa, m_memory_offsets = {brgemm_node->get_offset_a(), brgemm_node->get_offset_b(), brgemm_node->get_offset_c()}; m_buffer_ids = {utils::get_buffer_cluster_id(expr->get_input_port(0)), utils::get_buffer_cluster_id(expr->get_input_port(1)), utils::get_buffer_cluster_id(expr->get_output_port(0))}; - if (with_scratchpad(brgemm_type)) { + if (brgemm_node->get_input_size() == 3) { m_memory_offsets.push_back(brgemm_node->get_offset_scratch()); m_buffer_ids.push_back(utils::get_buffer_cluster_id(expr->get_input_port(2))); } @@ -51,29 +49,28 @@ jit_brgemm_emitter::jit_brgemm_emitter(jit_generator* h, cpu_isa_t isa, std::set> jit_brgemm_emitter::get_supported_precisions(const std::shared_ptr& node) { const auto brgemm = as_type_ptr(node); OV_CPU_JIT_EMITTER_ASSERT(brgemm, "get_supported_precisions() expects BrgemmCPU node"); - using brgemm_utils::BRGEMM_TYPE; - if (brgemm->get_type() == BRGEMM_TYPE::STAND_ALONE) { - return {{element::f32, element::f32}}; - } else if (brgemm->get_type() == BRGEMM_TYPE::REPACKING_ONLY) { + const auto& config = brgemm->get_config(); + if (config.need_compensations()) { + return {{element::i8, element::i8, element::f32}}; + } + if (config.is_amx()) { + return {{element::i8, element::i8, element::u8}, + {element::u8, element::i8, element::u8}, + {element::bf16, element::bf16, element::u8}}; + } + if (config.need_copy_b()) { std::set> supported_types = {{element::u8, element::i8}, {element::bf16, element::bf16}, {element::f32, element::f32}}; if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2_vnni_2)) supported_types.insert({element::i8, element::i8}); return supported_types; - } else if (brgemm->get_type() == BRGEMM_TYPE::WITH_COMPENSATIONS) { - return {{element::i8, element::i8, element::f32}}; - } else if (brgemm->get_type() == BRGEMM_TYPE::WITH_AMX) { - return {{element::i8, element::i8, element::u8}, - {element::u8, element::i8, element::u8}, - 
{element::bf16, element::bf16, element::u8}}; } - OV_CPU_JIT_EMITTER_THROW("got BrgemmCPU node with unsupported type"); + return {{element::f32, element::f32}}; } void jit_brgemm_emitter::validate_arguments(const std::vector &in, const std::vector &out) const { - OV_CPU_JIT_EMITTER_ASSERT(m_memory_offsets.size() == in.size() + 1 && (out.size() == 1), - "expects 3 inputs if there are compensations/wsp"); + OV_CPU_JIT_EMITTER_ASSERT((m_memory_offsets.size() == in.size() + 1) && (out.size() == 1), "incorrect count of registers"); } void jit_brgemm_emitter::emit_impl(const std::vector& in, const std::vector& out) const { diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.cpp index 6f1f4ab93aeda9..83ef7d608c94e2 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.cpp @@ -230,7 +230,7 @@ void BrgemmKernelExecutor::update_config(const ov::snippets::lowered::Expression OPENVINO_ASSERT(in_ports.size() > 1 && std::all_of(in_ports.cbegin(), in_ports.cend(), check_port) && out_ports.size() == 1 && check_port(out_ports.back()), "Incorrect Loop by Brgemm dimension M"); - M = current_expanded_loop_info->get_increment(); + M = current_expanded_loop_info->get_work_amount() > 0 ? current_expanded_loop_info->get_increment() : 0; input_pds[0]->set_subtensor_dim(1, M); output_pds[0]->set_subtensor_dim(1, M); } @@ -249,7 +249,7 @@ void BrgemmKernelExecutor::update_config(const ov::snippets::lowered::Expression OPENVINO_ASSERT(in_ports.size() == 2 && !in_ports.front().is_incremented && std::all_of(in_ports.cbegin(), in_ports.cend(), check_port) && out_ports.size() == 1 && check_port(out_ports.back()), "Incorrect Loop by Brgemm dimension N"); - N = current_expanded_loop_info->get_increment(); + N = current_expanded_loop_info->get_work_amount() > 0 ? 
current_expanded_loop_info->get_increment() : 0; input_pds[1]->set_subtensor_dim(0, N); output_pds[0]->set_subtensor_dim(0, N); } @@ -260,8 +260,9 @@ void BrgemmKernelExecutor::update_config(const ov::snippets::lowered::Expression // the most first executed Brgemm Block in Loops which iterate through dimension K (work_amount > 0). // First of them will have `beta = 0`, other - `beta = 1` float beta = 0; + const auto K_dim = *in0_shape.rbegin(); if (ov::snippets::utils::is_full_dim_value(K)) { - K = *in0_shape.rbegin(); + K = K_dim; } else { const auto& current_expanded_loop_info = get_loop_info(); const auto& in_ports = current_expanded_loop_info->get_input_ports(); @@ -272,21 +273,26 @@ void BrgemmKernelExecutor::update_config(const ov::snippets::lowered::Expression OPENVINO_ASSERT(in_ports.size() == 2 && in_ports.front().dim_idx == 0 && in_ports.back().dim_idx == 1 && out_ports.size() == 1 && !out_ports.front().is_incremented, "Incorrect Loop by Brgemm dimension K"); - K = current_expanded_loop_info->get_increment(); + K = current_expanded_loop_info->get_work_amount() > 0 ? 
current_expanded_loop_info->get_increment() : 0; input_pds[0]->set_subtensor_dim(0, K); input_pds[1]->set_subtensor_dim(1, K); if (K > 0) beta = get_beta(loop_manager, static_cast(loop_ids.back()), current_expanded_loop_info); } - const auto LDA = DIM_CAST(snippets::utils::get_dim_stride(expr->get_input_port(0))); - const auto LDC = DIM_CAST(snippets::utils::get_dim_stride(expr->get_output_port(0))); + auto LDA = DIM_CAST(snippets::utils::get_dim_stride(expr->get_input_port(0))); auto LDB = DIM_CAST(snippets::utils::get_dim_stride(expr->get_input_port(1))); + const auto LDC = DIM_CAST(snippets::utils::get_dim_stride(expr->get_output_port(0))); const auto& brgemm_node = as_type_ptr(expr->get_node()); OV_CPU_JIT_EMITTER_ASSERT(brgemm_node, "Got invalid node type in update_config"); // In case of data repacking LDB is chosen in accordance with repacking buffer size - if (with_repacking(brgemm_node->get_type())) - LDB = brgemm_utils::repacking::compute_out_leading_dim(N, brgemm_node->get_input_element_type(1)); + if (brgemm_node->get_config().need_copy_a()) { + const auto& src_type = brgemm_node->get_input_element_type(0); + K = rnd_up(K, brgemm_utils::compute_vnni_factor(src_type)); + LDA = brgemm_utils::repacking::compute_LDA(K, src_type); + } + if (brgemm_node->get_config().need_copy_b()) + LDB = brgemm_utils::repacking::compute_LDB(N, brgemm_node->get_input_element_type(1)); config.update(DIM_CAST(M), DIM_CAST(N), DIM_CAST(K), LDA, LDB, LDC, beta); } diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_a.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_a.cpp new file mode 100644 index 00000000000000..3bc01391def6e4 --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_a.cpp @@ -0,0 +1,225 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "brgemm_copy_a.hpp" + +#include "snippets/lowered/loop_manager.hpp" 
+#include "emitters/plugin/x64/utils.hpp" +#include "transformations/snippets/x64/op/brgemm_utils.hpp" + +#define DTYPE_CAST(X) static_cast(DnnlExtensionUtils::ElementTypeToDataType(X)) + +using namespace dnnl::impl; +using namespace dnnl::impl::cpu::x64; + +namespace ov { +namespace intel_cpu { + +BrgemmCopyAKernelConfig::BrgemmCopyAKernelConfig(const element::Type& src_dt, cpu_isa_t isa) + : m_static_params(std::make_shared(src_dt, isa)) { + m_hash = compute_hash(); +} + +bool BrgemmCopyAKernelConfig::is_completed() const { + return !utils::one_of(0, m_curr_M_blk, m_K, m_copy_A_wei_stride, m_LDA) || is_empty(); +} + +bool BrgemmCopyAKernelConfig::is_empty() const { + return everyone_is(0, m_curr_M_blk, m_K, m_copy_A_wei_stride, m_LDA); +} + +bool BrgemmCopyAKernelConfig::operator==(const BrgemmCopyAKernelConfig& rhs) const { +#define EQ(X) X == rhs.X + return EQ(m_hash) && EQ(m_curr_M_blk) && EQ(m_K) && EQ(m_copy_A_wei_stride) && EQ(m_LDA) && + (EQ(m_static_params.get()) || *m_static_params == *(rhs.m_static_params)); +#undef EQ +} + +void BrgemmCopyAKernelConfig::update(dnnl_dim_t cur_M_blk, dnnl_dim_t K, dnnl_dim_t copy_A_wei_stride, dnnl_dim_t LDA) { + // If one of the dims is zero, it means that BrgemmCopyB won't be executed (in Loop with work_amount = 0, for example) + // To process this case, we have to make this Config as empty (nullify runtime parameters) + if (utils::one_of(0, cur_M_blk, K)) { + m_curr_M_blk = 0; m_K = 0; + m_copy_A_wei_stride = 0; m_LDA = 0; + } else { + m_curr_M_blk = cur_M_blk; m_K = K; + m_copy_A_wei_stride = copy_A_wei_stride; m_LDA = LDA; + } + m_hash = compute_hash(); +} + +size_t BrgemmCopyAKernelConfig::compute_hash() const { + size_t seed = m_static_params->hash; +#define HASH(X) seed = hash_combine(seed, X) + HASH(m_curr_M_blk); HASH(m_K); + HASH(m_copy_A_wei_stride); HASH(m_LDA); +#undef HASH + return seed; +} + +BrgemmCopyAKernelConfig::StaticParams::StaticParams(const element::Type& etype, dnnl::impl::cpu::x64::cpu_isa_t 
isa) + : src_dt(DTYPE_CAST(etype)), isa(isa), K_blk(brgemm_utils::repacking::compute_inner_k_block(etype)), + vnni_factor(data_type_vnni_granularity(src_dt)), hash(init_hash(src_dt, isa, K_blk)) {} + +bool BrgemmCopyAKernelConfig::StaticParams::operator==(const StaticParams& rhs) const { +#define EQ(X) X == rhs.X + return EQ(hash) && EQ(src_dt) && EQ(isa) && EQ(K_blk); +#undef EQ +} + +size_t BrgemmCopyAKernelConfig::StaticParams::init_hash(const dnnl_data_type_t& src_dt, dnnl::impl::cpu::x64::cpu_isa_t isa, dnnl_dim_t K_blk) { + size_t seed = 0; +#define HASH(X) seed = hash_combine(seed, X) + HASH(src_dt); HASH(isa); HASH(K_blk); +#undef HASH + return seed; +} + +#ifdef SNIPPETS_DEBUG_CAPS +#define PRINT(X) ss << #X << " = " << X << "\n" +std::string BrgemmCopyAKernelConfig::to_string() const { + std::stringstream ss; + ss << m_static_params->to_string() << "\n"; + PRINT(m_hash); PRINT(m_curr_M_blk); PRINT(m_K); + PRINT(m_copy_A_wei_stride); PRINT(m_LDA); + return ss.str(); +} +std::string BrgemmCopyAKernelConfig::StaticParams::to_string() const { + std::stringstream ss; + PRINT(src_dt); PRINT(isa); PRINT(K_blk); + return ss.str(); +} +#undef PRINT +#endif + +BrgemmCopyAKernelExecutor::BrgemmCopyAKernelExecutor(ov::intel_cpu::MultiCacheWeakPtr kernel_cache, BrgemmCopyAKernelConfig config) + : CPUKernelExecutor(std::move(kernel_cache), std::move(config)) { } + +std::shared_ptr BrgemmCopyAKernelExecutor::compile_kernel(const BrgemmCopyAKernelConfig& config) const { + auto kernel = std::make_shared(); + + // BrgemmCopyA is not executable - nothing to compile + if (config.is_empty()) + return kernel; + + matmul::brgemm_matmul_conf_t conf; + conf.src_tag = dnnl_abcd; // unused + conf.K = config.get_K(); + conf.K_tail = config.get_K_tail(); + conf.K_blk = config.get_K_blk(); + conf.use_buffer_a_tail_only = false; + //padding K tail to K_blk, LDA is the stride for target tensor + conf.LDA = config.get_LDA(); + conf.has_zero_point_b = false; + 
conf.s8s8_compensation_required = false; + conf.wei_zp_type = dnnl::impl::cpu::x64::none; + conf.src_zp_type = dnnl::impl::cpu::x64::none; + conf.src_dt = config.get_src_dt(); + conf.copy_A_src_stride = config.get_copy_A_wei_stride(); + conf.a_dt_sz = dnnl_data_type_size(conf.src_dt); + // copied A has the same precision of original + conf.tr_a_dt_sz = dnnl_data_type_size(conf.src_dt); + conf.transposed_A = false; + conf.isa = config.get_isa(); + + auto status = create_brgemm_matmul_copy_a(kernel->compiled_kernel, &conf); + OV_CPU_JIT_EMITTER_ASSERT(status == dnnl_success, "Cannot create brgemm copy a kernel due to invalid params"); + + return kernel; +} + +void BrgemmCopyAKernelExecutor::update_config(const ov::snippets::lowered::ExpressionPtr& expr, + const ov::snippets::lowered::LinearIRCPtr& linear_ir, + BrgemmCopyAKernelConfig& config) const { + const auto& input_desc = expr->get_input_port_descriptor(0); + const auto& output_desc = expr->get_output_port_descriptor(0); + + const auto planar_shape = ov::snippets::utils::get_planar_vdims(expr->get_input_port(0)); + const auto& in_subtensor = input_desc->get_subtensor(); + + size_t loop_idx = 0; + const auto& loop_ids = expr->get_loop_ids(); + const auto& loop_manager = linear_ir->get_loop_manager(); + + auto get_blk = [&](size_t idx) { + OPENVINO_ASSERT(idx < planar_shape.size() && idx < in_subtensor.size(), "Index must be less than shape/subtensor rank!"); + const auto dim = *(planar_shape.rbegin() + idx); + size_t blk = *(in_subtensor.rbegin() + idx); + if (ov::snippets::utils::is_full_dim_value(blk)) { + blk = dim; + } else { + OPENVINO_ASSERT(loop_idx < loop_ids.size(), "Loop is missed"); + const auto& current_expanded_loop_info = loop_manager->get_loop_info(loop_ids[loop_idx++]); + blk = current_expanded_loop_info->get_work_amount() > 0 ? 
current_expanded_loop_info->get_increment() : 0; + input_desc->set_subtensor_dim(idx, blk); + output_desc->set_subtensor_dim(idx, blk); + OV_CPU_JIT_EMITTER_ASSERT(blk <= dim, "BrgemmCopyA has incompatible subtensor dimensions"); + } + return blk; + }; + + // Dimension M + const size_t M_blk = get_blk(1); + // Dimension K + const size_t K_blk = get_blk(0); + + const auto& src_type = expr->get_node()->get_input_element_type(0); + const auto LDA = brgemm_utils::repacking::compute_LDA(K_blk, src_type); + const auto copy_A_wei_stride = ov::snippets::utils::get_dim_stride(expr->get_input_port(0), 1) * src_type.size(); + + config.update(M_blk, K_blk, copy_A_wei_stride, LDA); +} + +void BrgemmCopyAKernelExecutor::execute(const BrgemmCopyAKernelExecutor* executor, call_args* args) { + const auto& kernel = executor->get_kernel(); + OV_CPU_JIT_EMITTER_ASSERT(kernel, "has nullptr kernel"); + OV_CPU_JIT_EMITTER_ASSERT(args, "has nullptr call args"); + OV_CPU_JIT_EMITTER_ASSERT(kernel->compiled_kernel, "has nullptr kernel"); + const auto& config = static_cast(executor->get_config()); + + auto ctx = matmul::jit_brgemm_matmul_copy_a_t::ctx_t(); + + ctx.current_M_blk = config.get_curr_M_blk(); + ctx.zp_b_compensation_buffer_ptr = nullptr; + ctx.zp_a_compensation_result_ptr = nullptr; + ctx.zp_b_neg_value_ptr = nullptr; + ctx.zp_ab_comp_ptr = nullptr; + + const uint8_t* src = reinterpret_cast(args->src); + uint8_t* tr_src = reinterpret_cast(args->tr_src); + + size_t start_in = 0; + size_t start_out = 0; + + const auto data_size = dnnl_data_type_size(config.get_src_dt()); + + auto add_ptr_increments = [&](size_t current_K) { + start_in += current_K * data_size; + start_out += current_K * data_size; + }; + + const size_t block_count = config.get_K() / config.get_K_blk(); + for (size_t i = 0; i < block_count; ++i) { + ctx.src = src + start_in; + ctx.tr_src = tr_src + start_out; + ctx.current_K_start = i * config.get_K_blk(); + ctx.current_K_blk = config.get_K_blk(); + + 
(*kernel->compiled_kernel)(&ctx); + + add_ptr_increments(config.get_K_blk()); + } + + if (config.get_K_tail()) { + ctx.src = src + start_in; + ctx.tr_src = tr_src + start_out; + ctx.current_K_start = block_count * config.get_K_blk(); + ctx.current_K_blk = config.get_K_tail(); + + (*kernel->compiled_kernel)(&ctx); + } +} + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_a.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_a.hpp new file mode 100644 index 00000000000000..07dfd1080c302c --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_a.hpp @@ -0,0 +1,104 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "emitters/plugin/x64/jit_emitter.hpp" +#include "emitters/snippets/cpu_kernel_executor_table.hpp" + +#include +#include + + +namespace ov { +namespace intel_cpu { + +struct BrgemmCopyAKernelConfig : public snippets::KernelExecutorBase::GenericConfig { +public: + BrgemmCopyAKernelConfig() = default; + BrgemmCopyAKernelConfig(const element::Type& src_dt, dnnl::impl::cpu::x64::cpu_isa_t isa); + + bool operator==(const BrgemmCopyAKernelConfig& rhs) const; + bool operator!=(const BrgemmCopyAKernelConfig& rhs) const {return !(*this == rhs);} + + std::unique_ptr get_clone_ptr() const override { + return std::unique_ptr(new BrgemmCopyAKernelConfig(*this)); + } + + bool is_empty() const; + bool is_completed() const override; + + void update(dnnl_dim_t cur_M_blk, dnnl_dim_t K, dnnl_dim_t copy_A_wei_stride, dnnl_dim_t LDA); + + size_t hash() const override { return m_hash; } + + dnnl_data_type_t get_src_dt() const { return m_static_params->src_dt; } + dnnl::impl::cpu::x64::cpu_isa_t get_isa() const { return m_static_params->isa; } + + dnnl_dim_t get_curr_M_blk() const { return m_curr_M_blk; } + dnnl_dim_t get_K() const { return m_K; } + dnnl_dim_t 
get_K_blk() const { return m_static_params->K_blk; } + dnnl_dim_t get_K_tail() const { return rnd_up(get_K() % get_K_blk(), m_static_params->vnni_factor); } + dnnl_dim_t get_copy_A_wei_stride() const { return m_copy_A_wei_stride; } + dnnl_dim_t get_LDA() const { return m_LDA; } + +#ifdef SNIPPETS_DEBUG_CAPS + std::string to_string() const override; +#endif + +private: + struct StaticParams { + StaticParams(const element::Type& src_dt, dnnl::impl::cpu::x64::cpu_isa_t isa); + + const dnnl_data_type_t src_dt {dnnl_data_type_undef}; + const dnnl::impl::cpu::x64::cpu_isa_t isa {dnnl::impl::cpu::x64::isa_undef}; + const dnnl_dim_t K_blk {0}; + const size_t vnni_factor {1}; + const size_t hash {0}; + + bool operator==(const StaticParams& rhs) const; + bool operator!=(const StaticParams& rhs) const { return !(*this == rhs); } + +#ifdef SNIPPETS_DEBUG_CAPS + std::string to_string() const; +#endif + + private: + static size_t init_hash(const dnnl_data_type_t& src_dt, dnnl::impl::cpu::x64::cpu_isa_t isa, dnnl_dim_t K_blk); + }; + + size_t compute_hash() const; + + std::shared_ptr m_static_params; + dnnl_dim_t m_curr_M_blk {0}; + dnnl_dim_t m_K {0}; + dnnl_dim_t m_copy_A_wei_stride {0}, m_LDA {0}; + size_t m_hash {SIZE_MAX}; +}; + +struct BrgemmCopyAKernel { + std::unique_ptr compiled_kernel = nullptr; +}; + +class BrgemmCopyAKernelExecutor : public CPUKernelExecutor { +public: + struct call_args { + const void* src = nullptr; + void* tr_src = nullptr; + }; + BrgemmCopyAKernelExecutor(ov::intel_cpu::MultiCacheWeakPtr kernel_cache, BrgemmCopyAKernelConfig config); + + static void execute(const BrgemmCopyAKernelExecutor* executor, call_args* args); + +protected: + std::shared_ptr compile_kernel(const BrgemmCopyAKernelConfig& c) const override; + + void update_config(const ov::snippets::lowered::ExpressionPtr& expr, + const ov::snippets::lowered::LinearIRCPtr& linear_ir, + BrgemmCopyAKernelConfig& config) const override; +}; +#define GET_OFF_BRGEMM_COPY_A_ARGS(field) 
offsetof(BrgemmCopyAKernelExecutor::call_args, field) + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp index 17f8923ae9867b..3a7316c438bab7 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp @@ -291,7 +291,7 @@ void BrgemmCopyBKernelExecutor::update_config(const ov::snippets::lowered::Expre } else { OPENVINO_ASSERT(loop_idx < loop_ids.size(), "Loop is missed"); const auto& current_expanded_loop_info = loop_manager->get_loop_info(loop_ids[loop_idx++]); - blk = current_expanded_loop_info->get_increment(); + blk = current_expanded_loop_info->get_work_amount() > 0 ? current_expanded_loop_info->get_increment() : 0; input_desc->set_subtensor_dim(idx, blk); output_desc->set_subtensor_dim(idx, blk); OV_CPU_JIT_EMITTER_ASSERT(blk <= dim, "BrgemmCopyB has incompatible subtensor dimensions"); @@ -305,7 +305,7 @@ void BrgemmCopyBKernelExecutor::update_config(const ov::snippets::lowered::Expre init(N_dim, N_blk, 0); const auto& brg_weight_etype = expr->get_node()->get_input_element_type(0); - const auto LDB = brgemm_utils::repacking::compute_out_leading_dim(N_dim, brg_weight_etype); + const auto LDB = brgemm_utils::repacking::compute_LDB(N_dim, brg_weight_etype); const auto copy_B_wei_stride = ov::snippets::utils::get_dim_stride(expr->get_input_port(0), config.is_transposed_B() ? 
0 : 1) * brg_weight_etype.size(); config.update(N_dim, N_blk, K_dim, K_blk, copy_B_wei_stride, LDB); diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index 5003deabc0bd40..25db6d7be531d8 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -886,7 +886,7 @@ Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptr()); - m_nthreads = std::min(parallel_get_max_threads(), static_cast(m_harness_work_amount)); + m_nthreads = std::min(parallel_get_max_threads(), static_cast(m_harness_work_amount)); m_buffer_scratchpad_size = snippet_config->buffer_scratchpad_size; OPENVINO_ASSERT(!ov::snippets::utils::is_dynamic_value(m_buffer_scratchpad_size), "Undefined buffer scratchpad size!"); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_a.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_a.cpp new file mode 100644 index 00000000000000..313c55852152bd --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_a.cpp @@ -0,0 +1,63 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "brgemm_copy_a.hpp" + +#include "snippets/itt.hpp" + +#include "snippets/utils/utils.hpp" +#include "utils/general_utils.h" + +namespace ov { +namespace intel_cpu { + +BrgemmCopyA::BrgemmCopyA(const Output& x, BrgemmConfig config, + const PortDescriptor& desc_in, const PortDescriptor& desc_out, std::vector layout_in) + : snippets::modifier::MemoryAccess(1, 1), op::Op({x}), m_config(config) { + set_output_size(1); + set_input_port_descriptor(desc_in, 0); + set_output_port_descriptor(desc_out, 0); + custom_constructor_validate_and_infer_types(std::move(layout_in)); +} + +BrgemmCopyA::BrgemmCopyA(const Output& x, BrgemmConfig config, + const size_t offset_in, const size_t offset_out, std::vector layout_in) + : BrgemmCopyA(x, std::move(config), 
PortDescriptor(0, offset_in), PortDescriptor(0, offset_out), layout_in) {} + +bool BrgemmCopyA::visit_attributes(AttributeVisitor& visitor) { + INTERNAL_OP_SCOPE(BrgemmCopyA_visit_attributes); + return MemoryAccess::visit_attributes(visitor); +} + +void BrgemmCopyA::custom_constructor_validate_and_infer_types(std::vector layout_input) { + INTERNAL_OP_SCOPE(BrgemmCopyA_ctor_validate_and_infer_types); + // During ctor call, BrgemmCopyA doesn't know his port descriptors. So we use port descs from source inputs + set_output_type(0, get_input_element_type(0), snippets::utils::get_planar_pshape(get_input_partial_shape(0), layout_input)); +} + +void BrgemmCopyA::validate_and_infer_types() { + INTERNAL_OP_SCOPE(BrgemmCopyA_validate_and_infer_types); + const auto layout = snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(input(0))->get_layout(); + set_output_type(0, get_input_element_type(0), snippets::utils::get_planar_pshape(get_input_partial_shape(0), layout)); +} + +std::shared_ptr BrgemmCopyA::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(BrgemmCopyA_clone_with_new_inputs); + check_new_args_count(this, new_args); + return std::make_shared(new_args.at(0), m_config, get_input_port_descriptor(0), get_output_port_descriptor(0), + snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(input(0))->get_layout()); +} + +BrgemmCopyA::ShapeInfer::ShapeInfer(const std::shared_ptr& n) { + const auto& op = ov::as_type_ptr(n); + OPENVINO_ASSERT(op, "Got invalid node in BrgemmCopyA::ShapeInfer"); + m_layout = snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(n->input(0))->get_layout(); +} + +ov::snippets::IShapeInferSnippets::Result BrgemmCopyA::ShapeInfer::infer(const std::vector& input_shapes) { + OPENVINO_ASSERT(input_shapes.size() == 1, "Got unexpected number of input shapes"); + return {{ov::snippets::utils::get_planar_vdims(input_shapes[0].get(), m_layout)}, ov::snippets::ShapeInferStatus::success}; +} + +} 
// namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_a.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_a.hpp new file mode 100644 index 00000000000000..58454877ce365c --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_a.hpp @@ -0,0 +1,52 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "snippets/op/memory_access.hpp" +#include "openvino/op/op.hpp" + +#include "brgemm_utils.hpp" + +namespace ov { +namespace intel_cpu { + +/** +* @interface BrgemmCopyA +* @brief The operation for data repacking of the first input of Brgemm +* @ingroup snippets +*/ +class BrgemmCopyA : public snippets::modifier::MemoryAccess, public ov::op::Op { +public: + using BrgemmConfig = brgemm_utils::BrgemmConfig; + OPENVINO_OP("BrgemmCopyA", "SnippetsOpset"); + + BrgemmCopyA(const Output& x, BrgemmConfig config, const size_t offset_in = 0lu, const size_t offset_out = 0lu, std::vector layout_in = {}); + BrgemmCopyA(const Output& x, BrgemmConfig config, const PortDescriptor& desc_in, const PortDescriptor& desc_out, std::vector layout_in = {}); + BrgemmCopyA() = default; + + size_t get_offset_in() const { return get_input_offset(0); } + size_t get_offset_out() const { return get_output_offset(0); } + + const BrgemmConfig& get_config() const { return m_config; } + + bool visit_attributes(AttributeVisitor& visitor) override; + void validate_and_infer_types() override; + bool has_evaluate() const override { return false; } + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + + class ShapeInfer : public snippets::IShapeInferSnippets { + std::vector m_layout {}; + public: + explicit ShapeInfer(const std::shared_ptr& n); + Result infer(const std::vector& input_shapes) override; + }; + +private: + void 
custom_constructor_validate_and_infer_types(std::vector layout_input = {}); + + const BrgemmConfig m_config {}; +}; +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.cpp index 3bda760d2cc180..a9f8b47f81d559 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.cpp @@ -13,19 +13,19 @@ namespace intel_cpu { intel_cpu::BrgemmCopyB::BrgemmCopyB(const Output& x, const element::Type src_type, - BRGEMM_TYPE type, + BrgemmConfig config, const size_t offset_in, const size_t offset_out0, const size_t offset_out1, std::vector layout_input) - : snippets::modifier::MemoryAccess(1, with_compensations(type) ? 2 : 1), + : snippets::modifier::MemoryAccess(1, config.need_compensations() ? 2 : 1), op::Op({x}), - m_type(type), + m_config(std::move(config)), m_src_type(src_type) { - set_output_size(with_compensations(m_type) ? 2 : 1); + set_output_size(m_config.need_compensations() ? 2 : 1); set_input_port_descriptor({0, offset_in}, 0); set_output_port_descriptor({0, offset_out0}, 0); - if (with_compensations(m_type)) { + if (m_config.need_compensations()) { set_output_port_descriptor({0, offset_out1}, 1); } custom_constructor_validate_and_infer_types(std::move(layout_input)); @@ -33,36 +33,33 @@ intel_cpu::BrgemmCopyB::BrgemmCopyB(const Output& x, intel_cpu::BrgemmCopyB::BrgemmCopyB(const Output& x, const element::Type src_type, - BRGEMM_TYPE type, + BrgemmConfig config, const PortDescriptor& desc_in0, const PortDescriptor& desc_out0, const PortDescriptor& desc_out1, std::vector layout_input) - : snippets::modifier::MemoryAccess(1, with_compensations(type) ? 2 : 1), + : snippets::modifier::MemoryAccess(1, config.need_compensations() ? 
2 : 1), op::Op({x}), - m_type(type), + m_config(std::move(config)), m_src_type(src_type) { - set_output_size(with_compensations(type) ? 2 : 1); + set_output_size(m_config.need_compensations() ? 2 : 1); set_input_port_descriptor(desc_in0, 0); set_output_port_descriptor(desc_out0, 0); - if (with_compensations(m_type)) { + if (m_config.need_compensations()) { set_output_port_descriptor(desc_out1, 1); } custom_constructor_validate_and_infer_types(std::move(layout_input)); } bool BrgemmCopyB::visit_attributes(AttributeVisitor& visitor) { - INTERNAL_OP_SCOPE(BrgemmRepack_visit_attributes); + INTERNAL_OP_SCOPE(BrgemmCopyB_visit_attributes); MemoryAccess::visit_attributes(visitor); visitor.on_attribute("src_type", m_src_type); - visitor.on_attribute("type", m_type); return true; } void BrgemmCopyB::custom_constructor_validate_and_infer_types(std::vector layout_input) { - INTERNAL_OP_SCOPE(BrgemmRepack_ctor_validate_and_infer_types); - OPENVINO_ASSERT(m_type == BRGEMM_TYPE::WITH_COMPENSATIONS || m_type == BRGEMM_TYPE::REPACKING_ONLY, - "Unsupported BRGEMM_TYPE value"); + INTERNAL_OP_SCOPE(BrgemmCopyB_ctor_validate_and_infer_types); // During ctor call, BrgemmCopyB doesn't know his port descriptors. 
// So we use port descs from source inputs const auto element_type = get_input_element_type(0); @@ -72,20 +69,20 @@ void BrgemmCopyB::custom_constructor_validate_and_infer_types(std::vectorget_shape()); const auto& planar_pshape = snippets::utils::get_planar_pshape(shape, port->get_layout()); set_output_type(0, element_type, planar_pshape); - if (with_compensations(m_type)) { + if (m_config.need_compensations()) { set_output_type(1, ov::element::f32, planar_pshape); } } @@ -96,17 +93,17 @@ void BrgemmCopyB::validate_element_type(const ov::element::Type& element_type) { } std::shared_ptr intel_cpu::BrgemmCopyB::clone_with_new_inputs(const OutputVector& new_args) const { - INTERNAL_OP_SCOPE(BrgemmRepack_clone_with_new_inputs); + INTERNAL_OP_SCOPE(BrgemmCopyB_clone_with_new_inputs); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_src_type, m_type, + return std::make_shared(new_args.at(0), m_src_type, m_config, get_input_port_descriptor(0), get_output_port_descriptor(0), - with_compensations(m_type) ? get_output_port_descriptor(1) : PortDescriptor{}, + m_config.need_compensations() ? 
get_output_port_descriptor(1) : PortDescriptor{}, snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(input(0))->get_layout()); } size_t BrgemmCopyB::get_offset_compensations() const { - OPENVINO_ASSERT(with_compensations(m_type) && get_output_size() == 2, + OPENVINO_ASSERT(m_config.need_compensations() && get_output_size() == 2, "The offset for compensations must be in BrgemmCopyB only with compensations and 2 outputs!"); return get_output_offset(1); } diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.hpp index 93a595a3c5f0e9..8326f9550223eb 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_copy_b.hpp @@ -22,13 +22,13 @@ namespace intel_cpu { */ class BrgemmCopyB : public snippets::modifier::MemoryAccess, public ov::op::Op { public: - using BRGEMM_TYPE = brgemm_utils::BRGEMM_TYPE; + using BrgemmConfig = brgemm_utils::BrgemmConfig; OPENVINO_OP("BrgemmCopyB", "SnippetsOpset"); - BrgemmCopyB(const Output& x, const element::Type src_type, BRGEMM_TYPE type = BRGEMM_TYPE::REPACKING_ONLY, + BrgemmCopyB(const Output& x, const element::Type src_type, BrgemmConfig config, const size_t offset_in = 0lu, const size_t offset_out0 = 0lu, const size_t offset_out1 = 0lu, std::vector layout_input = {}); - BrgemmCopyB(const Output& x, const element::Type src_type, BRGEMM_TYPE type, + BrgemmCopyB(const Output& x, const element::Type src_type, BrgemmConfig config, const PortDescriptor& desc_in0, const PortDescriptor& desc_out0, const PortDescriptor& desc_out1, std::vector layout_input = {}); BrgemmCopyB() = default; @@ -37,8 +37,8 @@ class BrgemmCopyB : public snippets::modifier::MemoryAccess, public ov::op::Op { size_t get_offset_out() const { return get_output_offset(0); } size_t get_offset_compensations() const; - BRGEMM_TYPE get_type() const { return 
m_type; } element::Type get_src_element_type() const { return m_src_type; } + const BrgemmConfig& get_config() const { return m_config; } bool visit_attributes(AttributeVisitor& visitor) override; void validate_and_infer_types() override; @@ -57,7 +57,7 @@ class BrgemmCopyB : public snippets::modifier::MemoryAccess, public ov::op::Op { void custom_constructor_validate_and_infer_types(std::vector layout_input = {}); void validate_element_type(const ov::element::Type& element_type); - BRGEMM_TYPE m_type = BRGEMM_TYPE::REPACKING_ONLY; + const BrgemmConfig m_config {}; element::Type m_src_type = ov::element::undefined; // src element type of the corresponding BRGEMM }; } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.cpp index b40bd88f31726b..f5ee13bf5fe3c1 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.cpp @@ -14,10 +14,10 @@ namespace ov { namespace intel_cpu { using namespace brgemm_utils; -BrgemmCPU::BrgemmCPU(const Output& A, const Output& B, BRGEMM_TYPE type, +BrgemmCPU::BrgemmCPU(const Output& A, const Output& B, BrgemmConfig config, const size_t offset_a, const size_t offset_b, const size_t offset_c, std::vector layout_a, std::vector layout_b, std::vector layout_c) - : Brgemm(), m_type(type) { + : Brgemm(), m_config(std::move(config)) { // We call default ctor of Brgemm class to avoid incorrect shape infer in constructor_validate_and_type_infer() call set_arguments({A, B}); set_output_size(1); @@ -28,10 +28,10 @@ BrgemmCPU::BrgemmCPU(const Output& A, const Output& B, BRGEMM_TYPE t custom_constructor_validate_and_infer_types(std::move(layout_a), std::move(layout_b), std::move(layout_c)); } -BrgemmCPU::BrgemmCPU(const Output& A, const Output& B, const Output& scratch, BRGEMM_TYPE type, +BrgemmCPU::BrgemmCPU(const Output& 
A, const Output& B, const Output& scratch, BrgemmConfig config, const size_t offset_a, const size_t offset_b, const size_t offset_scratch, const size_t offset_c, std::vector layout_a, std::vector layout_b, std::vector layout_c) - : Brgemm(), m_type(type) { + : Brgemm(), m_config(std::move(config)) { set_arguments({A, B, scratch}); set_output_size(1); ctor_initialize(std::set{0, 1, 2}, std::set{0}); @@ -42,10 +42,10 @@ BrgemmCPU::BrgemmCPU(const Output& A, const Output& B, const Output< custom_constructor_validate_and_infer_types(std::move(layout_a), std::move(layout_b), std::move(layout_c)); } -BrgemmCPU::BrgemmCPU(const Output& A, const Output& B, BRGEMM_TYPE type, +BrgemmCPU::BrgemmCPU(const Output& A, const Output& B, BrgemmConfig config, const PortDescriptor& desc_a, const PortDescriptor& desc_b, const PortDescriptor& desc_c, std::vector layout_a, std::vector layout_b, std::vector layout_c) - : Brgemm(), m_type(type) { + : Brgemm(), m_config(std::move(config)) { set_arguments({A, B}); set_output_size(1); m_input_ports = {{0, desc_a}, {1, desc_b}}; @@ -53,10 +53,10 @@ BrgemmCPU::BrgemmCPU(const Output& A, const Output& B, BRGEMM_TYPE t custom_constructor_validate_and_infer_types(std::move(layout_a), std::move(layout_b), std::move(layout_c)); } -BrgemmCPU::BrgemmCPU(const Output& A, const Output& B, const Output& scratch, BRGEMM_TYPE type, +BrgemmCPU::BrgemmCPU(const Output& A, const Output& B, const Output& scratch, BrgemmConfig config, const PortDescriptor& desc_a, const PortDescriptor& desc_b, const PortDescriptor& desc_scratch, const PortDescriptor& desc_c, std::vector layout_a, std::vector layout_b, std::vector layout_c) - : Brgemm(), m_type(type) { + : Brgemm(), m_config(std::move(config)) { set_arguments({A, B, scratch}); set_output_size(1); m_input_ports = {{0, desc_a}, {1, desc_b}, {2, desc_scratch}}; @@ -70,11 +70,9 @@ void BrgemmCPU::custom_constructor_validate_and_infer_types(std::vector // During ctor call, BrgemmCPU doesn't know his port 
descriptors. // So we use port descs from source inputs - const auto brgemm_copy = with_repacking(m_type) ? get_brgemm_copy() : nullptr; const auto planar_input_shapes = std::vector{ snippets::utils::get_planar_pshape(get_input_partial_shape(0), layout_a), - brgemm_copy ? snippets::utils::get_planar_pshape(brgemm_copy->input(0)) - : snippets::utils::get_planar_pshape(get_input_partial_shape(1), layout_b) }; + snippets::utils::get_planar_pshape(get_input_partial_shape(1), layout_b) }; auto output_shape = infer_output_partial_shape(planar_input_shapes); set_output_type(0, get_output_type(), snippets::utils::get_planar_pshape(output_shape, layout_c)); @@ -96,42 +94,54 @@ void BrgemmCPU::validate_and_infer_types() { void BrgemmCPU::validate_with_scratchpad() const { // Additional check for 3rd input - if (with_compensations(m_type)) { + if (m_config.need_compensations()) { OPENVINO_ASSERT(get_input_element_type(2) == ov::element::f32, "BRGEMM Scratch with compensations must have FP32 element type"); - } else if (with_amx(m_type)) { + } else if (m_config.need_wsp()) { OPENVINO_ASSERT(get_input_partial_shape(2).is_static(), "BRGEMM Scratch must have static shape"); OPENVINO_ASSERT(get_input_element_type(2) == ov::element::u8, "BRGEMM Scratch must have U8 element type"); } } void BrgemmCPU::validate_inputs() const { - OPENVINO_ASSERT(implication(one_of(m_type, BRGEMM_TYPE::STAND_ALONE, BRGEMM_TYPE::REPACKING_ONLY), get_input_size() == 2), - "BrgemmCPU expects 2 inputs in cases, when input precisions are f32|f32, u8|i8 or bf16|bf16 (non-AMX system)"); - OPENVINO_ASSERT(implication(one_of(m_type, BRGEMM_TYPE::WITH_COMPENSATIONS, BRGEMM_TYPE::WITH_AMX), get_input_size() == 3), - "BrgemmCPU expects 3 inputs with input precisions i8|i8 and bf16|bf16 on AMX system"); + if (m_config.need_compensations() || m_config.need_wsp()) + OPENVINO_ASSERT(get_input_size() == 3, "BrgemmCPU expects 3 inputs with input precisions i8|i8 and bf16|bf16 on AMX system"); + else + 
OPENVINO_ASSERT(get_input_size() == 2, "BrgemmCPU expects 2 inputs in cases, when input precisions are f32|f32, u8|i8 or bf16|bf16 (non-AMX system)"); } std::shared_ptr BrgemmCPU::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(BrgemmCPU_clone_with_new_inputs); check_new_args_count(this, new_args); - std::shared_ptr brgemm; - if (!with_scratchpad(m_type)) { - return std::make_shared(new_args.at(0), new_args.at(1), m_type, + if (get_input_size() == 2) { + return std::make_shared(new_args.at(0), new_args.at(1), m_config, get_input_port_descriptor(0), get_input_port_descriptor(1), get_output_port_descriptor(0), snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(input(0))->get_layout(), snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(input(1))->get_layout(), snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(output(0))->get_layout()); - } else { - return std::make_shared(new_args.at(0), new_args.at(1), new_args.at(2), m_type, - get_input_port_descriptor(0), get_input_port_descriptor(1), get_input_port_descriptor(2), get_output_port_descriptor(0), - snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(input(0))->get_layout(), - snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(input(1))->get_layout(), - snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(output(0))->get_layout()); } + return std::make_shared(new_args.at(0), new_args.at(1), new_args.at(2), m_config, + get_input_port_descriptor(0), get_input_port_descriptor(1), get_input_port_descriptor(2), get_output_port_descriptor(0), + snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(input(0))->get_layout(), + snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(input(1))->get_layout(), + snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(output(0))->get_layout()); } -std::shared_ptr BrgemmCPU::get_brgemm_copy() const { - OPENVINO_ASSERT(one_of(m_type, 
BRGEMM_TYPE::REPACKING_ONLY, BRGEMM_TYPE::WITH_COMPENSATIONS, BRGEMM_TYPE::WITH_AMX), "Brgemm doesn't need BrgemmCopyB"); +std::shared_ptr BrgemmCPU::get_brgemm_copy_a() const { + OPENVINO_ASSERT(m_config.need_copy_a(), "Brgemm doesn't need BrgemmCopyA"); + auto a_input_node = get_input_node_shared_ptr(0); + if (const auto brgemm_copy_a = ov::as_type_ptr(a_input_node)) { + return brgemm_copy_a; + } + if (ov::is_type(a_input_node)) { + if (const auto brgemm_copy_a = ov::as_type_ptr(a_input_node->get_input_node_shared_ptr(0))) { + return brgemm_copy_a; + } + } + OPENVINO_THROW("BrgemmCopyA hasn't been found!"); +} + +std::shared_ptr BrgemmCPU::get_brgemm_copy_b() const { + OPENVINO_ASSERT(m_config.need_copy_b(), "Brgemm doesn't need BrgemmCopyB"); auto b_input_node = get_input_node_shared_ptr(1); if (const auto brgemm_copy_b = ov::as_type_ptr(b_input_node)) { return brgemm_copy_b; @@ -145,15 +155,13 @@ std::shared_ptr BrgemmCPU::get_brgemm_copy() const { } size_t BrgemmCPU::get_offset_scratch() const { - OPENVINO_ASSERT(with_scratchpad(m_type) && get_input_size() == 3, "Offset of scratchpad must be only in Brgemm with scratchpad on 3rd input"); + OPENVINO_ASSERT(get_input_size() == 3, "Offset of scratchpad must be only in Brgemm with scratchpad on 3rd input"); return get_input_offset(2); } bool BrgemmCPU::visit_attributes(AttributeVisitor& visitor) { Brgemm::visit_attributes(visitor); - visitor.on_attribute("type", m_type); return true; } } // namespace intel_cpu - } // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.hpp index a646ffc792fd6d..f5881282caf8f0 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.hpp @@ -5,6 +5,7 @@ #pragma once #include "snippets/op/brgemm.hpp" +#include "brgemm_copy_a.hpp" #include "brgemm_copy_b.hpp" #include 
"brgemm_utils.hpp" @@ -21,19 +22,19 @@ namespace intel_cpu { */ class BrgemmCPU : public snippets::op::Brgemm { public: - using BRGEMM_TYPE = brgemm_utils::BRGEMM_TYPE; + using BrgemmConfig = brgemm_utils::BrgemmConfig; OPENVINO_OP("BrgemmCPU", "SnippetsOpset", snippets::op::Brgemm); - BrgemmCPU(const Output& A, const Output& B, BRGEMM_TYPE type, + BrgemmCPU(const Output& A, const Output& B, BrgemmConfig config, const size_t offset_a = 0, const size_t offset_b = 0, const size_t offset_c = 0, std::vector layout_a = {}, std::vector layout_b = {}, std::vector layout_c = {}); - BrgemmCPU(const Output& A, const Output& B, const Output& scratch, BRGEMM_TYPE type, + BrgemmCPU(const Output& A, const Output& B, const Output& scratch, BrgemmConfig config, const size_t offset_a = 0, const size_t offset_b = 0, const size_t offset_scratch = 0, const size_t offset_c = 0, std::vector layout_a = {}, std::vector layout_b = {}, std::vector layout_c = {}); - BrgemmCPU(const Output& A, const Output& B, BRGEMM_TYPE type, + BrgemmCPU(const Output& A, const Output& B, BrgemmConfig config, const PortDescriptor& desc_a, const PortDescriptor& desc_b, const PortDescriptor& desc_c, std::vector layout_a = {}, std::vector layout_b = {}, std::vector layout_c = {}); - BrgemmCPU(const Output& A, const Output& B, const Output& scratch, BRGEMM_TYPE type, + BrgemmCPU(const Output& A, const Output& B, const Output& scratch, BrgemmConfig config, const PortDescriptor& desc_a, const PortDescriptor& desc_b, const PortDescriptor& desc_scratch, const PortDescriptor& desc_c, std::vector layout_a = {}, std::vector layout_b = {}, std::vector layout_c = {}); BrgemmCPU() = default; @@ -41,10 +42,11 @@ class BrgemmCPU : public snippets::op::Brgemm { void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; - BRGEMM_TYPE get_type() const { return m_type; } + const BrgemmConfig& get_config() const { return m_config; } size_t get_offset_scratch() 
const; - std::shared_ptr get_brgemm_copy() const; + std::shared_ptr get_brgemm_copy_a() const; + std::shared_ptr get_brgemm_copy_b() const; bool visit_attributes(AttributeVisitor& visitor) override; @@ -55,7 +57,7 @@ class BrgemmCPU : public snippets::op::Brgemm { void validate_with_scratchpad() const; void validate_inputs() const; - BRGEMM_TYPE m_type = BRGEMM_TYPE::STAND_ALONE; + const BrgemmConfig m_config {}; }; } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp index 844ec338b8a83b..506209f8d10a67 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp @@ -19,49 +19,45 @@ namespace ov { namespace intel_cpu { namespace brgemm_utils { -cpu_isa_t get_primitive_isa(const ov::element::Type& dt_in0, bool is_with_amx) { - auto isa = isa_undef; -#define SUPPORT(X, Y) if (mayiuse(X)) { isa = X; } else { Y } -#define SUPPORT_ONE(X, MESSAGE) SUPPORT(X, OV_CPU_JIT_EMITTER_THROW(MESSAGE);) -#define SUPPORT_TWO(X, Y, MESSAGE) SUPPORT(X, SUPPORT_ONE(Y, MESSAGE)) -#define SUPPORT_THREE(X, Y, Z, MESSAGE) SUPPORT(X, SUPPORT_TWO(Y, Z, MESSAGE)) - - // Note: AMX might be not used even if it's supported by the hardware, check the BrgemmToBrgemmCPU pass for details - if (is_with_amx) { - SUPPORT_ONE(avx512_core_amx, "Unsupported hardware configuration: amx is supported only on avx512 platforms") - } else if (dt_in0 == ov::element::bf16) { - SUPPORT_ONE(avx512_core_bf16, "Unsupported hardware configuration: bf16 is supported only on avx512 platforms") - } else if (one_of(dt_in0, ov::element::u8, ov::element::i8)) { - SUPPORT_THREE(avx512_core_vnni, avx2_vnni_2, avx2_vnni, "Unsupported hardware configuration: int8 is supported only on vnni platforms") - } else { - SUPPORT_TWO(avx512_core, cpu::x64::avx2, "Unsupported 
hardware configuration: brgemm requires at least avx2 isa") +BrgemmConfig::BrgemmConfig(const ov::element::Type& src_dt, const ov::element::Type& wei_dt, size_t K, bool transposed_b) { + const auto is_fp32 = src_dt == ov::element::f32 && wei_dt == ov::element::f32; + const auto is_bf16 = src_dt == ov::element::bf16 && wei_dt == ov::element::bf16; + const auto is_int8 = (src_dt == ov::element::i8 || src_dt == ov::element::u8) && wei_dt == ov::element::i8; + OPENVINO_ASSERT(is_fp32 || is_bf16 || is_int8, "Incorrect configuration"); + + // Init ISA + if (is_bf16) { + m_isa = mayiuse(avx512_core_amx) ? avx512_core_amx : + mayiuse(avx512_core_bf16) ? avx512_core_bf16 : isa_undef; + } else if (is_int8) { + m_isa = mayiuse(avx512_core_amx) ? avx512_core_amx : + mayiuse(avx512_core_vnni) ? avx512_core_vnni : + mayiuse(avx2_vnni_2) ? avx2_vnni_2 : + mayiuse(avx2_vnni) ? avx2_vnni : isa_undef; + } else if (is_fp32) { + m_isa = mayiuse(avx512_core) ? avx512_core : + mayiuse(cpu::x64::avx2) ? cpu::x64::avx2 : isa_undef; } - return isa; -#undef SUPPORT_TWO -#undef SUPPORT_ONE -#undef SUPPORT + OPENVINO_ASSERT(m_isa != isa_undef, "ISA is undefined!"); + + m_need_copy_a = is_amx() && (is_dynamic_value(K) || (K % compute_vnni_factor(src_dt) != 0)); + m_need_copy_b = !is_fp32 || transposed_b; + + m_need_compensations = src_dt == ov::element::i8 && !one_of(m_isa, avx512_core_amx, avx2_vnni_2); + m_need_wsp = m_isa == avx512_core_amx; + + validate(); +} + +BrgemmConfig::BrgemmConfig(const ov::element::Type& src_dt, cpu_isa_t isa, bool need_copy_a, bool need_copy_b, bool need_compensations, bool need_wsp) + : m_isa(isa), m_need_copy_a(need_copy_a), m_need_copy_b(need_copy_b), m_need_compensations(need_compensations), m_need_wsp(need_wsp) { + validate(); } -BRGEMM_TYPE get_brgemm_type(const ov::element::Type& element_type_a, const Dimension& K_dim, bool transpose_b) { - if (element_type_a == element::f32) - return transpose_b ? 
BRGEMM_TYPE::REPACKING_ONLY : BRGEMM_TYPE::STAND_ALONE; - - OPENVINO_ASSERT(element_type_a != element::bf16 || mayiuse(dnnl::impl::cpu::x64::avx512_core_bf16), - "BF16 precision is not supported on this hardware"); - - const auto brgemmVNNIFactor = 4 / element_type_a.size(); - if (one_of(element_type_a, element::u8, element::i8, element::bf16) && - dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx) && - K_dim.is_static() && K_dim.get_length() % brgemmVNNIFactor == 0) - return BRGEMM_TYPE::WITH_AMX; - // Note: this condition reproduces logic from the OneDNN Brgemm implementation. This is needed to align with the - // backend requirements. More details in onednn/src/cpu/x64/brgemm/brgemm_utils.cpp - if (element_type_a == ov::element::i8) - return dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2_vnni_2) ? BRGEMM_TYPE::REPACKING_ONLY : BRGEMM_TYPE::WITH_COMPENSATIONS; - - if (one_of(element_type_a, element::u8, ov::element::bf16)) - return BRGEMM_TYPE::REPACKING_ONLY; - OV_CPU_JIT_EMITTER_THROW("Failed to determine brgemm mode"); +void BrgemmConfig::validate() const { + OPENVINO_ASSERT(m_isa != isa_undef, "ISA is undefined"); + OPENVINO_ASSERT(IMPLICATION(m_need_wsp, is_amx()), "Scratchpad with empty memory is needed only for AMX"); + OPENVINO_ASSERT(IMPLICATION(m_need_compensations, !is_amx() && m_need_copy_b), "Compensations must be only with BrgemmCopyB on non-amx platforms"); } size_t compute_vnni_factor(const ov::element::Type& precision) { @@ -76,7 +72,12 @@ size_t get_elems_in_vec(const ov::element::Type& precision) { } namespace repacking { -size_t compute_out_leading_dim(const size_t n_block, const ov::element::Type& precision) { + +size_t compute_LDA(const size_t k_block, const ov::element::Type& precision) { + return rnd_up(k_block, compute_inner_k_block(precision)); +} + +size_t compute_LDB(const size_t n_block, const ov::element::Type& precision) { return std::max(n_block, compute_inner_n_block(precision)); } @@ -88,17 +89,11 @@ 
size_t compute_inner_n_block(const ov::element::Type& precision) { default: OPENVINO_THROW("BrgemmCopyB doesn't support precision ", precision); } } + +size_t compute_inner_k_block(const ov::element::Type& precision) { + return brgemm_utils::get_elems_in_vec(precision); +} } // namespace repacking } // namespace brgemm_utils } // namespace intel_cpu -template <> -EnumNames& EnumNames::get() { - static auto enum_names = - EnumNames("ov::intel_cpu::jit_bgremm_utils::BRGEMM_TYPE", - {{"stand_alone", ov::intel_cpu::brgemm_utils::BRGEMM_TYPE::STAND_ALONE}, - {"with_amx", ov::intel_cpu::brgemm_utils::BRGEMM_TYPE::WITH_AMX}, - {"with_compensations", ov::intel_cpu::brgemm_utils::BRGEMM_TYPE::WITH_COMPENSATIONS}, - {"repacking_only", ov::intel_cpu::brgemm_utils::BRGEMM_TYPE::REPACKING_ONLY}}); - return enum_names; -} } // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp index bc627c59920c4b..3ff1419b2ba512 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp @@ -13,28 +13,29 @@ namespace ov { namespace intel_cpu { namespace brgemm_utils { -enum class BRGEMM_TYPE { - STAND_ALONE, // No extra requirements, used for f32|f32 - WITH_AMX, // i8|i8 or bf16|bf16 on AMX system - needs BrgemmCopyB and scratchpad - WITH_COMPENSATIONS, // i8|i8 (non-AMX system) - needs BrgemmCopyB for data repacking and compensations - REPACKING_ONLY // u8|i8 or bf16|bf16 (non-AMX system) - needs BrgemmCopyB on second input for data repacking -}; - -dnnl::impl::cpu::x64::cpu_isa_t get_primitive_isa(const ov::element::Type& dt_in0, bool is_with_amx); - -BRGEMM_TYPE get_brgemm_type(const element::Type& element_type_a, const Dimension& K_dim, bool transpose_b); - -inline bool stand_alone(BRGEMM_TYPE type) { return type == BRGEMM_TYPE::STAND_ALONE; } - -inline bool 
with_amx(BRGEMM_TYPE type) { return type == BRGEMM_TYPE::WITH_AMX; } - -inline bool with_compensations(BRGEMM_TYPE type) { return type == BRGEMM_TYPE::WITH_COMPENSATIONS; } +class BrgemmConfig { +public: + BrgemmConfig() = default; + BrgemmConfig(const ov::element::Type& src_dt, const ov::element::Type& wei_dt, size_t K, bool transposed_b); + BrgemmConfig(const ov::element::Type& src_dt, dnnl::impl::cpu::x64::cpu_isa_t isa, + bool need_copy_a = false, bool need_copy_b = false, bool need_compensations = false, bool need_wsp = false); -inline bool repacking_only(BRGEMM_TYPE type) { return type == BRGEMM_TYPE::REPACKING_ONLY; } + dnnl::impl::cpu::x64::cpu_isa_t isa() const { return m_isa; } + bool is_amx() const { return m_isa == dnnl::impl::cpu::x64::cpu_isa_t::avx512_core_amx; } + bool need_copy_a() const { return m_need_copy_a; } + bool need_copy_b() const { return m_need_copy_b; } + bool need_compensations() const { return m_need_compensations; } + bool need_wsp() const { return m_need_wsp; } -inline bool with_repacking(BRGEMM_TYPE type) { return type != BRGEMM_TYPE::STAND_ALONE; } +private: + void validate() const; -inline bool with_scratchpad(BRGEMM_TYPE type) { return with_compensations(type) || with_amx(type); } + dnnl::impl::cpu::x64::cpu_isa_t m_isa = dnnl::impl::cpu::x64::cpu_isa_t::isa_undef; + bool m_need_copy_a = false; + bool m_need_copy_b = false; + bool m_need_compensations = false; + bool m_need_wsp = false; +}; /// \brief Computes VNNI factor used by OneDNN implementation. 
Depends on tensor precision size_t compute_vnni_factor(const ov::element::Type& precision); @@ -42,24 +43,23 @@ size_t compute_vnni_factor(const ov::element::Type& precision); size_t get_elems_in_vec(const ov::element::Type& precision); namespace repacking { +/** + * @brief Computes leading dimension (LDA) which must be used in brgemm and brgemm_copy_a emitters + * @param k_block K block size shared between BrgemmCPU and BrgemmCopyA node + * @param precision tensor precision + */ +size_t compute_LDA(const size_t k_block, const ov::element::Type& precision); /** * @brief Computes leading dimension (LDB) which must be used in brgemm and brgemm_copy_b emitters * @param n_block N block size shared between BrgemmCPU and BrgemmCopyB node * @param precision tensor precision */ -size_t compute_out_leading_dim(const size_t n_block, const ov::element::Type& precision); +size_t compute_LDB(const size_t n_block, const ov::element::Type& precision); /// \brief Computes inner N block size used by OneDNN implementation. Depends on tensor precision size_t compute_inner_n_block(const ov::element::Type& precision); +/// \brief Computes inner K block size used by OneDNN implementation.
Depends on tensor precision +size_t compute_inner_k_block(const ov::element::Type& precision); } // namespace repacking } // namespace brgemm_utils } // namespace intel_cpu -template <> -class AttributeAdapter : - public EnumAttributeAdapterBase { -public: - AttributeAdapter(intel_cpu::brgemm_utils::BRGEMM_TYPE& value) : - EnumAttributeAdapterBase(value) { - } - OPENVINO_RTTI("AttributeAdapter"); -}; } // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.cpp index abb6147bac3588..6a7f7b821a7809 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.cpp @@ -9,6 +9,7 @@ #include "snippets/utils/utils.hpp" #include "snippets/op/brgemm.hpp" #include "snippets/op/buffer.hpp" +#include "transformations/snippets/x64/op/brgemm_copy_a.hpp" #include "transformations/snippets/x64/op/brgemm_copy_b.hpp" #include "transformations/snippets/x64/op/brgemm_cpu.hpp" #include "transformations/tpp/x64/op/modifiers.hpp" @@ -57,70 +58,88 @@ pass::BrgemmToBrgemmCPU::BrgemmToBrgemmCPU() { const auto& brgemm_in1_desc = PortDescriptorUtils::get_port_descriptor_ptr(brgemm->input(1)); const auto& brgemm_out_desc = PortDescriptorUtils::get_port_descriptor_ptr(brgemm->output(0)); - const auto dimsMatMulIn0 = snippets::utils::get_planar_pshape(brgemm->input(0)); const auto dimsMatMulIn1 = snippets::utils::get_planar_pshape(brgemm->input(1)); + const auto K = ov::snippets::utils::dimension_to_size_t(*++dimsMatMulIn1.rbegin()); + const auto element_type_a = brgemm->get_input_element_type(0); + const auto element_type_b = brgemm->get_input_element_type(1); - const auto K = *dimsMatMulIn0.rbegin(); + std::shared_ptr brgemm_cpu = nullptr; + std::shared_ptr brgemm_copy_a = nullptr; + std::shared_ptr brgemm_copy_b = nullptr; - const 
auto& layout_a = brgemm_in0_desc->get_layout(); - const auto& layout_b = brgemm_in1_desc->get_layout(); - const auto& layout_c = brgemm_out_desc->get_layout(); + auto brgemm_in0 = brgemm->input_value(0); + auto brgemm_in1 = brgemm->input_value(1); + + auto layout_a = brgemm_in0_desc->get_layout(); + auto layout_b = brgemm_in1_desc->get_layout(); + auto layout_c = brgemm_out_desc->get_layout(); + + auto offset_a = brgemm->get_offset_a(); + auto offset_b = brgemm->get_offset_b(); + auto offset_c = brgemm->get_offset_c(); - const auto element_type_a = brgemm->get_input_element_type(0); const bool transpose_b = !layout_b.empty() && layout_b.back() != layout_b.size() - 1; - const auto brgemm_type = brgemm_utils::get_brgemm_type(element_type_a, K, transpose_b); - const auto offset_a = brgemm->get_offset_a(); - const auto offset_b = brgemm->get_offset_b(); - const auto offset_c = brgemm->get_offset_c(); + const auto brgemm_config = brgemm_utils::BrgemmConfig(element_type_a, element_type_b, K, transpose_b); - std::shared_ptr brgemm_cpu = nullptr; - std::shared_ptr brgemm_repacking = nullptr; - if (stand_alone(brgemm_type)) { - brgemm_cpu = std::make_shared(brgemm->input_value(0), brgemm->input_value(1), brgemm_type, - offset_a, offset_b, offset_c, layout_a, layout_b, layout_c); + if (brgemm_config.need_copy_a()) { + brgemm_copy_a = std::make_shared(brgemm_in0, brgemm_config, offset_a, 0, layout_a); + PortDescriptorUtils::set_port_descriptor(brgemm_copy_a->input(0), brgemm_in0_desc->get_subtensor(), layout_a); + set_full_port_desc(brgemm_copy_a->output(0)); + + brgemm_in0 = brgemm_copy_a->output(0); + layout_a.clear(); + offset_a = 0; + } + + if (brgemm_config.need_copy_b()) { + brgemm_copy_b = std::make_shared(brgemm_in1, element_type_a, brgemm_config, offset_b, 0, 0, layout_b); + PortDescriptorUtils::set_port_descriptor(brgemm_copy_b->input(0), brgemm_in1_desc->get_subtensor(), layout_b); + for (const auto& out : brgemm_copy_b->outputs()) + set_full_port_desc(out); + + 
brgemm_in1 = brgemm_copy_b->output(0); + layout_b.clear(); + offset_b = 0; + } + + if (brgemm_config.need_wsp()) { + const auto scratch = std::make_shared(ov::Shape{BrgemmCPU::SCRATCH_BYTE_SIZE}); + brgemm_cpu = std::make_shared(brgemm_in0, brgemm_in1, scratch, brgemm_config, + offset_a, offset_b, 0, offset_c, layout_a, layout_b, layout_c); + + set_full_port_desc(scratch->output(0)); + set_full_port_desc(brgemm_cpu->input(2)); + } else if (brgemm_config.need_compensations()) { + OPENVINO_ASSERT(brgemm_copy_b, "BrgemmCopyB is required for compensations"); + brgemm_cpu = std::make_shared(brgemm_in0, brgemm_in1, brgemm_copy_b->output(1), brgemm_config, + offset_a, offset_b, 0, offset_c, layout_a, layout_b, layout_c); } else { - const auto copy_b_type = with_compensations(brgemm_type) ? brgemm_type : brgemm_utils::BRGEMM_TYPE::REPACKING_ONLY; - brgemm_repacking = std::make_shared(brgemm->input_value(1), element_type_a, copy_b_type, offset_b, 0, 0, layout_b); - PortDescriptorUtils::set_port_descriptor(brgemm_repacking->input(0), brgemm_in1_desc->get_subtensor(), layout_b); - for (const auto& output : brgemm_repacking->outputs()) - set_full_port_desc(output); - - if (with_amx(brgemm_type)) { - const auto scratch = std::make_shared(ov::Shape{BrgemmCPU::SCRATCH_BYTE_SIZE}); - brgemm_cpu = std::make_shared(brgemm->input_value(0), brgemm_repacking->output(0), scratch, brgemm_type, - offset_a, offset_b, 0, offset_c, - layout_a, std::vector{}, layout_c); - set_full_port_desc(scratch->output(0)); - set_full_port_desc(brgemm_cpu->input(2)); - } else if (with_compensations(brgemm_type)) { - brgemm_cpu = std::make_shared(brgemm->input_value(0), brgemm_repacking->output(0), brgemm_repacking->output(1), - brgemm_type, offset_a, offset_b, 0, offset_c, - layout_a, std::vector{}, layout_c); - } else if (repacking_only(brgemm_type)) { - brgemm_cpu = std::make_shared(brgemm->input_value(0), brgemm_repacking->output(0), brgemm_type, - offset_a, offset_b, offset_c, - layout_a, std::vector{}, layout_c); - }
else { - OPENVINO_THROW("Invalid configuration for BRGEMM CPU"); - } + brgemm_cpu = std::make_shared(brgemm_in0, brgemm_in1, brgemm_config, offset_a, offset_b, offset_c, + layout_a, layout_b, layout_c); } brgemm_cpu->set_friendly_name(brgemm->get_friendly_name()); ov::replace_node(brgemm, brgemm_cpu); - // Transfer ports - PortDescriptorUtils::set_port_descriptor(brgemm_cpu->input(0), brgemm_in0_desc->get_subtensor(), layout_a); - if (brgemm_repacking) { + // need to run validate_and_infer_types manually: either input shapes were updated or + // output Layout was updated (out shape will be updated in validate_and_infer_types()) + + if (brgemm_copy_a) { + set_full_port_desc(brgemm_cpu->input(0)); + brgemm_copy_a->validate_and_infer_types(); + } else { + PortDescriptorUtils::set_port_descriptor(brgemm_cpu->input(0), brgemm_in0_desc->get_subtensor(), layout_a); + } + + if (brgemm_copy_b) { set_full_port_desc(brgemm_cpu->input(1)); + brgemm_copy_b->validate_and_infer_types(); } else { PortDescriptorUtils::set_port_descriptor(brgemm_cpu->input(1), brgemm_in1_desc->get_subtensor(), layout_b); } + PortDescriptorUtils::set_port_descriptor(brgemm_cpu->output(0), brgemm_out_desc->get_subtensor(), layout_c); - // need to run validate_and_infer_types manually: either input shapes were updated or - // output Layout was updated (out shape will be updated in validate_and_infer_types()) - if (brgemm_repacking) - brgemm_repacking->validate_and_infer_types(); brgemm_cpu->validate_and_infer_types(); return true; diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.cpp index 51565537c43568..0dc1be27048d9d 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.cpp @@ -44,15 +44,26 @@ LinearIR::constExprIt 
BrgemmCPUBlocking::move_new_memory_buffer(LinearIR& linear return std::prev(brgemm_it); } -LinearIR::constExprIt BrgemmCPUBlocking::get_loop_begin_pos(LinearIR& linear_ir, const LinearIR::constExprIt& brgemm_it, const ExpressionPtr& copy_b_expr) { +LinearIR::constExprIt BrgemmCPUBlocking::move_brgemm_copy_a(LinearIR& linear_ir, const LinearIR::constExprIt& insert_it, + const LinearIR::constExprIt& brgemm_copy_a_it) { + if (*brgemm_copy_a_it != *std::prev(insert_it)) { + linear_ir.move(brgemm_copy_a_it, insert_it); + } + return std::prev(insert_it); +} + +LinearIR::constExprIt BrgemmCPUBlocking::get_loop_begin_pos(LinearIR& linear_ir, const LinearIR::constExprIt& brgemm_it, + const ExpressionPtr& copy_a_expr, const ExpressionPtr& copy_b_expr) { auto loop_begin_it = brgemm_it; const auto& brgemm_expr = *brgemm_it; const auto brgemm = ov::as_type_ptr(brgemm_expr->get_node()); OPENVINO_ASSERT(brgemm, "get_loop_begin_pos must be called only for BrgemmCPU expression"); - if (with_amx(brgemm->get_type())) + if (brgemm->get_config().is_amx()) loop_begin_it = move_new_memory_buffer(linear_ir, brgemm_it); if (copy_b_expr) loop_begin_it = linear_ir.find(copy_b_expr); + if (copy_a_expr) + loop_begin_it = move_brgemm_copy_a(linear_ir, loop_begin_it, linear_ir.find(copy_a_expr)); return loop_begin_it; } @@ -66,7 +77,7 @@ std::tuple BrgemmCPUBlocking::get_blocking_params(const size_t m_blk, n_blk, k_blk; std::tie(m_blk, n_blk, k_blk) = BrgemmBlockingBase::get_blocking_params(brgemm_expr); - if (with_repacking(brgemm->get_type())) { + if (brgemm->get_config().need_copy_b()) { n_blk = get_full_dim_value(); k_blk = get_full_dim_value(); } @@ -86,46 +97,57 @@ bool BrgemmCPUBlocking::mark_blocking_loops(LinearIR& linear_ir, size_t k_block) { const auto& brgemm_expr = *brgemm_it; const auto brgemm = ov::as_type_ptr(brgemm_expr->get_node()); - const auto type = brgemm->get_type(); + const auto& config = brgemm->get_config(); - if (stand_alone(type)) + if (!config.need_copy_a() && 
!config.need_copy_b()) return ov::snippets::lowered::pass::BrgemmBlockingBase::mark_blocking_loops(linear_ir, brgemm_it, m_block, n_block, k_block); brgemm_expr->get_input_port_descriptor(0)->set_subtensor({m_block, k_block}); brgemm_expr->get_input_port_descriptor(1)->set_subtensor({k_block, n_block}); brgemm_expr->get_output_port_descriptor(0)->set_subtensor({m_block, n_block}); - const auto copy_b_expr = linear_ir.get_expr_by_node(brgemm->get_brgemm_copy()); - copy_b_expr->get_input_port_descriptor(0)->set_subtensor({k_block, n_block}); - copy_b_expr->get_output_port_descriptor(0)->set_subtensor({k_block, n_block}); - if (with_compensations(type)) { - const ov::snippets::VectorDims compensations_subtensor{1, n_block}; - OPENVINO_ASSERT(brgemm_expr->get_input_count() == 3, "Brgemm must have 3 inputs in case of compensations."); - brgemm_expr->get_input_port_descriptor(2)->set_subtensor(compensations_subtensor); - copy_b_expr->get_output_port_descriptor(1)->set_subtensor(compensations_subtensor); + ExpressionPtr copy_a_expr = nullptr, copy_b_expr = nullptr; + if (config.need_copy_a()) { + copy_a_expr = linear_ir.get_expr_by_node(brgemm->get_brgemm_copy_a()); + copy_a_expr->get_input_port_descriptor(0)->set_subtensor({m_block, ov::snippets::utils::get_full_dim_value()}); + copy_a_expr->get_output_port_descriptor(0)->set_subtensor({m_block, ov::snippets::utils::get_full_dim_value()}); + } + if (config.need_copy_b()) { + copy_b_expr = linear_ir.get_expr_by_node(brgemm->get_brgemm_copy_b()); + copy_b_expr->get_input_port_descriptor(0)->set_subtensor({k_block, n_block}); + copy_b_expr->get_output_port_descriptor(0)->set_subtensor({k_block, n_block}); + if (config.need_compensations()) { + const ov::snippets::VectorDims compensations_subtensor{1, n_block}; + OPENVINO_ASSERT(brgemm_expr->get_input_count() == 3, "Brgemm must have 3 inputs in case of compensations."); + brgemm_expr->get_input_port_descriptor(2)->set_subtensor(compensations_subtensor); + 
copy_b_expr->get_output_port_descriptor(1)->set_subtensor(compensations_subtensor); + } } const auto& loop_manager = linear_ir.get_loop_manager(); if (!is_full_dim_value(k_block)) { - const auto loop_begin = get_loop_begin_pos(linear_ir, brgemm_it, copy_b_expr); + const auto loop_begin = get_loop_begin_pos(linear_ir, brgemm_it, nullptr, copy_b_expr); + const auto loop_in1 = copy_b_expr ? copy_b_expr->get_input_port(0) : brgemm_expr->get_input_port(1); const std::vector entries{LoopPort(brgemm_expr->get_input_port(0), true, 0), - LoopPort(copy_b_expr->get_input_port(0), true, 1)}; + LoopPort(loop_in1, true, 1)}; const std::vector exits{LoopPort(brgemm_expr->get_output_port(0), false)}; mark_k_blocking(loop_manager, loop_begin, std::next(brgemm_it), entries, exits, k_block); } if (!is_full_dim_value(n_block)) { - const auto loop_begin = get_loop_begin_pos(linear_ir, brgemm_it, copy_b_expr); + const auto loop_begin = get_loop_begin_pos(linear_ir, brgemm_it, nullptr, copy_b_expr); + const auto loop_in1 = copy_b_expr ? copy_b_expr->get_input_port(0) : brgemm_expr->get_input_port(1); const std::vector entries{LoopPort(brgemm_expr->get_input_port(0), false), - LoopPort(copy_b_expr->get_input_port(0), true)}; + LoopPort(loop_in1, true)}; const std::vector exits{LoopPort(brgemm_expr->get_output_port(0), true)}; mark_n_blocking(loop_manager, loop_begin, std::next(brgemm_it), entries, exits, n_block); } if (!is_full_dim_value(m_block)) { - const bool include_repacking = !is_full_dim_value(k_block) || !is_full_dim_value(n_block); - const auto loop_begin = get_loop_begin_pos(linear_ir, brgemm_it, include_repacking ? copy_b_expr : nullptr); - const auto b_input_port = include_repacking ? 
copy_b_expr->get_input_port(0) : brgemm_expr->get_input_port(1); - std::vector entries{LoopPort(brgemm_expr->get_input_port(0), true), LoopPort(b_input_port, false)}; - if (!include_repacking && with_compensations(type)) + const bool include_copy_b = !is_full_dim_value(k_block) || !is_full_dim_value(n_block); + const auto loop_begin = get_loop_begin_pos(linear_ir, brgemm_it, copy_a_expr, include_copy_b ? copy_b_expr : nullptr); + const auto loop_in0 = copy_a_expr ? copy_a_expr->get_input_port(0) : brgemm_expr->get_input_port(0); + const auto loop_in1 = include_copy_b ? copy_b_expr->get_input_port(0) : brgemm_expr->get_input_port(1); + std::vector entries{LoopPort(loop_in0, true), LoopPort(loop_in1, false)}; + if (!include_copy_b && config.need_compensations()) entries.emplace_back(brgemm_expr->get_input_port(2), false); const std::vector exits{LoopPort(brgemm_expr->get_output_port(0), true)}; mark_m_blocking(loop_manager, loop_begin, std::next(brgemm_it), entries, exits, m_block); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.hpp index 22429a6b0c98fb..41d7346d2a3490 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.hpp @@ -39,9 +39,13 @@ class BrgemmCPUBlocking : public ov::snippets::lowered::pass::BrgemmBlocking& n, + const std::shared_ptr& factory) : BufferExpression(n, factory) {} + +snippets::lowered::ExpressionPtr RepackedActivationsBufferExpression::clone() const { + return std::shared_ptr(new RepackedActivationsBufferExpression(*this)); +} + +void RepackedActivationsBufferExpression::validate() const { + BufferExpression::validate(); + OPENVINO_ASSERT(get_input_count() == 1, "RepackedActivationsBufferExpression must have only one input"); + const auto& parent_out = 
get_input_port_connector(0)->get_source(); + OPENVINO_ASSERT(ov::is_type(parent_out.get_expr()->get_node()) && parent_out.get_index() == 0, + "RepackedActivationsBufferExpression expects BrgemmCopyA as parent expression"); +} + +void RepackedActivationsBufferExpression::init_allocation_size(const std::shared_ptr& loop_manager, size_t allocation_rank) { + const auto& parent_port = get_input_port_connector(0)->get_source(); + const auto& in_subtensor = ov::snippets::utils::get_projected_subtensor(parent_port); + const auto& k_dim = parent_port.get_descriptor_ptr()->get_shape().back(); + + const size_t k_blk = brgemm_utils::repacking::compute_inner_k_block(get_node()->get_input_element_type(0)); + const size_t m_blk = *++in_subtensor.rbegin(); + + m_allocation_size = snippets::utils::dynamic_safe_mul(m_blk, snippets::utils::rnd_up(k_dim, k_blk)); +} + RepackedWeightsBufferExpression::RepackedWeightsBufferExpression(const std::shared_ptr& n, const std::shared_ptr& factory) : BufferExpression(n, factory) {} diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/expressions/brgemm_copy_b_buffer_expressions.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/expressions/brgemm_copy_buffer_expressions.hpp similarity index 69% rename from src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/expressions/brgemm_copy_b_buffer_expressions.hpp rename to src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/expressions/brgemm_copy_buffer_expressions.hpp index b85e75c55da30b..46605b630ec54e 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/expressions/brgemm_copy_b_buffer_expressions.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/expressions/brgemm_copy_buffer_expressions.hpp @@ -9,6 +9,21 @@ namespace ov { namespace intel_cpu { +class RepackedActivationsBufferExpression : public snippets::lowered::BufferExpression { + friend class 
snippets::lowered::ExpressionFactory; +public: + OPENVINO_RTTI("RepackedActivationsBufferExpression", "0", BufferExpression) + RepackedActivationsBufferExpression() = default; + + void validate() const override; + void init_allocation_size(const std::shared_ptr& loop_manager, size_t allocation_rank) override; + +private: + RepackedActivationsBufferExpression(const std::shared_ptr& n, const std::shared_ptr& factory); + + snippets::lowered::ExpressionPtr clone() const override; +}; + class RepackedWeightsBufferExpression : public snippets::lowered::BufferExpression { friend class snippets::lowered::ExpressionFactory; public: diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.cpp index bd8dd12bd39256..dc14dc1388d82b 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.cpp @@ -7,8 +7,9 @@ #include "snippets/lowered/loop_manager.hpp" #include "snippets/itt.hpp" +#include "transformations/snippets/x64/op/brgemm_copy_a.hpp" #include "transformations/snippets/x64/op/brgemm_copy_b.hpp" -#include "expressions/brgemm_copy_b_buffer_expressions.hpp" +#include "expressions/brgemm_copy_buffer_expressions.hpp" using namespace ov::intel_cpu::brgemm_utils::repacking; @@ -23,28 +24,31 @@ bool InsertBrgemmCopyBBuffers::run(LinearIR& linear_ir, LinearIR::constExprIt be const auto& factory = linear_ir.get_expr_factory(); - auto insert_buffer = [&](const ExpressionPtr& copy_b_expr, size_t out_port, LinearIR::constExprIt insertion_pos) { - const auto& copy_b = ov::as_type_ptr(copy_b_expr->get_node()); - const auto& copy_b_out = copy_b_expr->get_output_port_connector(out_port); - const auto copy_b_consumers = copy_b_out->get_consumers(); - 
OPENVINO_ASSERT(copy_b_consumers.size() == 1, "BufferCopyB must have only one consumer on each out port - Brgemm"); - const auto& buffer_op = std::make_shared(copy_b->output(out_port)); + auto insert_buffer = [&](const ExpressionPtr& expr, size_t out_port, LinearIR::constExprIt insertion_pos) { + const auto& out = expr->get_output_port_connector(out_port); + const auto consumers = out->get_consumers(); + OPENVINO_ASSERT(consumers.size() == 1, "BrgemmCopyA and BrgemmCopyB must have only one consumer on each out port - Brgemm"); + const auto& buffer_op = std::make_shared(expr->get_node()->output(out_port)); BufferExpressionPtr buffer_expr = nullptr; - if (out_port == 0) { - buffer_expr = factory->build(buffer_op, {copy_b_out}); - } else if (out_port == 1 && with_compensations(copy_b->get_type())) { - buffer_expr = factory->build(buffer_op, {copy_b_out}); - } else { - OPENVINO_THROW("BrgemmCopyB has incorrect output ports"); + if (ov::is_type(expr->get_node())) { + OPENVINO_ASSERT(out_port == 0, "BrgemmCopyA must have only one output!"); + buffer_expr = factory->build(buffer_op, {out}); + } else if (const auto brgemm_copy_b_expr = ov::as_type_ptr(expr->get_node())) { + if (out_port == 0) { + buffer_expr = factory->build(buffer_op, {out}); + } else if (out_port == 1 && brgemm_copy_b_expr->get_config().need_compensations()) { + buffer_expr = factory->build(buffer_op, {out}); + } else { + OPENVINO_THROW("BrgemmCopyB has incorrect output ports"); + } } - return linear_ir.insert_expr(buffer_expr, LoopManager::get_common_outer_loops(copy_b_expr, copy_b_consumers.begin()->get_expr()), - true, insertion_pos, {copy_b_consumers}); + return linear_ir.insert_expr(buffer_expr, LoopManager::get_common_outer_loops(expr, consumers.begin()->get_expr()), true, insertion_pos, {consumers}); }; bool modified = false; for (auto expr_it = begin; expr_it != end; ++expr_it) { const auto expr = *expr_it; - if (auto copy_b = ov::as_type_ptr(expr->get_node())) { + if
(ov::is_type(expr->get_node()) || ov::is_type(expr->get_node())) { for (size_t i = 0; i < expr->get_output_count(); ++i) { expr_it = insert_buffer(expr, i, std::next(expr_it)); } diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/shape_inference.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/shape_inference.cpp index 00657d9ec04387..5d32ff877f3379 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/shape_inference.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/shape_inference.cpp @@ -4,6 +4,7 @@ #include "shape_inference.hpp" #include +#include "op/brgemm_copy_a.hpp" #include "op/brgemm_copy_b.hpp" #include "op/brgemm_cpu.hpp" #include "transformations/snippets/common/op/fused_mul_add.hpp" @@ -57,6 +58,7 @@ const CPUShapeInferSnippetsFactory::TRegistry CPUShapeInferSnippetsFactory::spec SHAPE_INFER_OP_SPECIFIC_EXTERNAL(ov::intel_cpu::tpp::op::ReduceSum, ReduceShapeInfer), #endif SHAPE_INFER_OP_SPECIFIC_EXTERNAL(ov::intel_cpu::BrgemmCPU, BrgemmShapeInfer), + SHAPE_INFER_OP_SPECIFIC(ov::intel_cpu::BrgemmCopyA), SHAPE_INFER_OP_SPECIFIC(ov::intel_cpu::BrgemmCopyB), }; #undef SHAPE_INFER_OP_SPECIFIC diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp index c4e5af875323ae..7a3840ecfdb872 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp @@ -515,12 +515,12 @@ std::vector disabledTestPatterns() { retVector.emplace_back(R"(.*smoke_RDFT_CPU_1D/RDFTTestCPU.CompareWithRefs/prec=f32_IS0=\[\]_TS0=\(\(126\)\)_constAxes=true_axes=\(\(0\)\)_isInverse=false.*)"); retVector.emplace_back(R"(.*smoke_RDFT_CPU_2D/RDFTTestCPU.CompareWithRefs/prec=f32_IS0=\[\]_TS0=\(\(16.38\)\)_constAxes=true_axes=\(\(0.1\)\)_isInverse=false.*)"); #endif - if 
(!ov::with_cpu_x86_avx512_core()) { - // on platforms which do not support bfloat16, we are disabling bf16 tests since there are no bf16 primitives, - // tests are useless on such platforms - retVector.emplace_back(R"(.*(BF|bf)16.*)"); - retVector.emplace_back(R"(.*bfloat16.*)"); - } + if (!ov::with_cpu_x86_avx512_core()) { + // on platforms which do not support bfloat16, we are disabling bf16 tests since there are no bf16 primitives, + // tests are useless on such platforms + retVector.emplace_back(R"(.*(BF|bf)16.*)"); + retVector.emplace_back(R"(.*bfloat16.*)"); + } if (!ov::with_cpu_x86_avx2()) { // MatMul in Snippets uses BRGEMM that is supported only on AVX2 (and newer) platforms // Disabled Snippets MHA tests as well because MHA pattern contains MatMul @@ -542,16 +542,16 @@ std::vector disabledTestPatterns() { R"(.*EltwiseLayerCPUTest.*IS=\(\[1\.\.10\.2\.5\.6\]_\).*eltwiseOpType=SqDiff.*_configItem=INFERENCE_PRECISION_HINT=f16.*)"); } #endif - if (!ov::with_cpu_x86_avx512_core_vnni() && - !ov::with_cpu_x86_avx2_vnni() && - !ov::with_cpu_x86_avx512_core_amx_int8()) { - // MatMul in Snippets uses BRGEMM that supports i8 only on platforms with VNNI or AMX instructions - retVector.emplace_back(R"(.*Snippets.*MatMulFQ.*)"); - retVector.emplace_back(R"(.*Snippets.*MatMul.*Quantized.*)"); - retVector.emplace_back(R"(.*Snippets.*MHAFQ.*)"); - retVector.emplace_back(R"(.*Snippets.*MHAINT8.*)"); - retVector.emplace_back(R"(.*Snippets.*MHAQuant.*)"); - } + if (!ov::with_cpu_x86_avx512_core_vnni() && + !ov::with_cpu_x86_avx2_vnni() && + !ov::with_cpu_x86_avx512_core_amx_int8()) { + // MatMul in Snippets uses BRGEMM that supports i8 only on platforms with VNNI or AMX instructions + retVector.emplace_back(R"(.*Snippets.*MatMulFQ.*)"); + retVector.emplace_back(R"(.*Snippets.*MatMul.*Quantized.*)"); + retVector.emplace_back(R"(.*Snippets.*MHAFQ.*)"); + retVector.emplace_back(R"(.*Snippets.*MHAINT8.*)"); + retVector.emplace_back(R"(.*Snippets.*MHAQuant.*)"); + } if (!ov::with_cpu_x86_avx512_core_amx_int8()) // TODO: Issue 92895 // on platforms which do not support AMX, we are disabling I8 input tests @@ -563,12 +563,12 @@ std::vector disabledTestPatterns() { retVector.emplace_back(R"(.*smoke_Snippets_MHAEnforceBF16.*)"); } // [150842] Need to support dynamic K dimension of BF16|INT8 MatMul on AMX systems - if (ov::with_cpu_x86_avx512_core_amx()) { - retVector.emplace_back(R"(.*smoke_Snippets_MatMul/MatMul.CompareWithRefImpl/.*IS\[0\]=\[2.2.70.\?\].*T\[0\]=(u8|i8|bf16)_T\[1\]=(i8|bf16).*)"); - retVector.emplace_back(R"(.*smoke_Snippets_MatMul/MatMul.CompareWithRefImpl/.*IS\[0\]=\[\?.\?.\?.\?\].*T\[0\]=(u8|i8|bf16)_T\[1\]=(i8|bf16).*)"); - retVector.emplace_back(R"(.*smoke_Snippets_MatMulTransposeB.*IS\[0\]=\[\?.\?.\?.\?\].*T\[0\]=(u8|i8|bf16)_T\[1\]=(i8|bf16).*)"); - retVector.emplace_back(R"(.*smoke_Snippets_MatMulBias.*IS\[0\]=\[\?.\?.\?.\?\].*T\[0\]=(u8|i8|bf16)_T\[1\]=(i8|bf16).*)"); - } + if (ov::with_cpu_x86_avx512_core_amx()) { + retVector.emplace_back(R"(.*smoke_Snippets_MatMul/MatMul.CompareWithRefImpl/.*IS\[0\]=\[2.2.70.\?\].*T\[0\]=(u8|i8|bf16)_T\[1\]=(i8|bf16).*)"); + retVector.emplace_back(R"(.*smoke_Snippets_MatMul/MatMul.CompareWithRefImpl/.*IS\[0\]=\[\?.\?.\?.\?\].*T\[0\]=(u8|i8|bf16)_T\[1\]=(i8|bf16).*)"); + retVector.emplace_back(R"(.*smoke_Snippets_MatMulTransposeB.*IS\[0\]=\[\?.\?.\?.\?\].*T\[0\]=(u8|i8|bf16)_T\[1\]=(i8|bf16).*)"); + retVector.emplace_back(R"(.*smoke_Snippets_MatMulBias.*IS\[0\]=\[\?.\?.\?.\?\].*T\[0\]=(u8|i8|bf16)_T\[1\]=(i8|bf16).*)"); + } #ifdef SNIPPETS_LIBXSMM_TPP // GN in TPP requires exposing tmp Buffer results outside the loop (ticket: 151234) retVector.emplace_back(R"(.*smoke_Snippets_GroupNormalization.*)"); diff --git a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/brgemm_blocking.cpp b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/brgemm_blocking.cpp
index 89f2e06c14a9fa..4d4a0d7e68ab26 100644 --- a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/brgemm_blocking.cpp +++ b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/brgemm_blocking.cpp @@ -23,7 +23,6 @@ using namespace ov::intel_cpu; using namespace ov::snippets::lowered; using namespace ov::snippets::lowered::pass; using namespace ov::snippets; -using BRGEMM_TYPE = intel_cpu::brgemm_utils::BRGEMM_TYPE; namespace { enum class BACKEND_TYPE{CPU, TPP}; @@ -118,6 +117,19 @@ void create_brgemm_with_copy_b_loop_infos(const LinearIRPtr& linear_ir, BrgemmBlockingBase::get_default_blocking_loop_handlers(m, m_block))); } } + +static brgemm_utils::BrgemmConfig brgemm_config_default() { + return brgemm_utils::BrgemmConfig(ov::element::f32, dnnl::impl::cpu::x64::cpu_isa_t::avx512_core); +} +static brgemm_utils::BrgemmConfig brgemm_config_only_repacking(const ov::element::Type& src_dt) { + return brgemm_utils::BrgemmConfig(src_dt, dnnl::impl::cpu::x64::cpu_isa_t::avx512_core, false, true, false, false); +} +static brgemm_utils::BrgemmConfig brgemm_config_with_comps() { + return brgemm_utils::BrgemmConfig(ov::element::i8, dnnl::impl::cpu::x64::cpu_isa_t::avx512_core_vnni, false, true, true, false); +} +static brgemm_utils::BrgemmConfig brgemm_config_amx(const ov::element::Type& src_dt) { + return brgemm_utils::BrgemmConfig(src_dt, dnnl::impl::cpu::x64::cpu_isa_t::avx512_core_amx, false, true, false, true); +} } // namespace class BrgemmBlockingTest : public LoweredPassTestsF { @@ -154,11 +166,12 @@ TEST_F(BrgemmCPUBlockingTest, Floating) { const VectorDims layout_a{0, 2, 1, 3}; const VectorDims layout_b{0, 2, 3, 1}; const VectorDims layout_c{0, 2, 1, 3}; + const auto brgemm_config = brgemm_config_default(); { auto data_a = linear_ir->push_node(precision, input_shape_a); auto data_b = linear_ir->push_node(precision, input_shape_b); - auto brgemm = linear_ir->push_node(data_a.second, data_b.second, BRGEMM_TYPE::STAND_ALONE, + auto 
brgemm = linear_ir->push_node(data_a.second, data_b.second, brgemm_config, 0, 0, 0, layout_a, layout_b, layout_c); init_expr_descriptors(*brgemm.first, {}, {layout_a, layout_b, layout_c}); auto result = linear_ir->push_node(brgemm.second); @@ -166,7 +179,7 @@ TEST_F(BrgemmCPUBlockingTest, Floating) { { auto data_a = linear_ir_ref->push_node(precision, input_shape_a); auto data_b = linear_ir_ref->push_node(precision, input_shape_b); - auto brgemm = linear_ir_ref->push_node(data_a.second, data_b.second, BRGEMM_TYPE::STAND_ALONE, + auto brgemm = linear_ir_ref->push_node(data_a.second, data_b.second, brgemm_config, 0, 0, 0, layout_a, layout_b, layout_c); const auto& brgemm_expr = *brgemm.first; init_expr_descriptors(brgemm_expr, {{m_blk, k_blk}, {k_blk, n_blk}, {m_blk, n_blk}}, {layout_a, layout_b, layout_c}); @@ -183,19 +196,20 @@ TEST_F(BrgemmCPUBlockingTest, Floating_LargeK) { const ov::PartialShape input_shape_a{1, 16, m, k}; const ov::PartialShape input_shape_b{1, 16, k, n}; const auto precision = ov::element::f32; + const auto brgemm_config = brgemm_config_default(); k_blk = 1024; { auto data_a = linear_ir->push_node(precision, input_shape_a); auto data_b = linear_ir->push_node(precision, input_shape_b); - auto brgemm = linear_ir->push_node(data_a.second, data_b.second, BRGEMM_TYPE::STAND_ALONE); + auto brgemm = linear_ir->push_node(data_a.second, data_b.second, brgemm_config); init_expr_descriptors(*brgemm.first, {}); auto result = linear_ir->push_node(brgemm.second); } { auto data_a = linear_ir_ref->push_node(precision, input_shape_a); auto data_b = linear_ir_ref->push_node(precision, input_shape_b); - auto brgemm = linear_ir_ref->push_node(data_a.second, data_b.second, BRGEMM_TYPE::STAND_ALONE); + auto brgemm = linear_ir_ref->push_node(data_a.second, data_b.second, brgemm_config); const auto& brgemm_expr = *brgemm.first; init_expr_descriptors(brgemm_expr, {{m_blk, k_blk}, {k_blk, n_blk}, {m_blk, n_blk}}); create_brgemm_loop_infos(linear_ir_ref, brgemm_expr, m, 
m_blk, k, k_blk, n, n_blk); @@ -211,18 +225,19 @@ TEST_F(BrgemmCPUBlockingTest, BlockingIsNotNeeded) { const ov::PartialShape input_shape_a{1, 16, m, k}; const ov::PartialShape input_shape_b{1, 16, k, n}; const auto precision = ov::element::f32; + const auto brgemm_config = brgemm_config_default(); { auto data_a = linear_ir->push_node(precision, input_shape_a); auto data_b = linear_ir->push_node(precision, input_shape_b); - auto brgemm = linear_ir->push_node(data_a.second, data_b.second, BRGEMM_TYPE::STAND_ALONE); + auto brgemm = linear_ir->push_node(data_a.second, data_b.second, brgemm_config); init_expr_descriptors(*brgemm.first); auto result = linear_ir->push_node(brgemm.second); } { auto data_a = linear_ir_ref->push_node(precision, input_shape_a); auto data_b = linear_ir_ref->push_node(precision, input_shape_b); - auto brgemm = linear_ir_ref->push_node(data_a.second, data_b.second, BRGEMM_TYPE::STAND_ALONE); + auto brgemm = linear_ir_ref->push_node(data_a.second, data_b.second, brgemm_config); const auto full_subtensor = VectorDims(2, ov::snippets::utils::get_full_dim_value()); init_expr_descriptors(*brgemm.first, std::vector(3, full_subtensor)); auto result = linear_ir_ref->push_node(brgemm.second); @@ -237,25 +252,26 @@ TEST_F(BrgemmCPUBlockingTest, WithDataRepacking) { const ov::PartialShape input_shape_b{1, 16, k, n}; const auto precision_a = ov::element::u8; const auto precision_b = ov::element::i8; + const auto brgemm_config = brgemm_config_only_repacking(precision_a); { auto data_a = linear_ir->push_node(precision_a, input_shape_a); auto data_b = linear_ir->push_node(precision_b, input_shape_b); - auto copy_b = linear_ir->push_node(data_b.second, precision_a, BRGEMM_TYPE::REPACKING_ONLY); + auto copy_b = linear_ir->push_node(data_b.second, precision_a, brgemm_config); init_expr_descriptors(*copy_b.first); - auto brgemm = linear_ir->push_node(data_a.second, copy_b.second, BRGEMM_TYPE::REPACKING_ONLY); + auto brgemm = linear_ir->push_node(data_a.second, 
copy_b.second, brgemm_config); init_expr_descriptors(*brgemm.first); auto result = linear_ir->push_node(brgemm.second); } { auto data_a = linear_ir_ref->push_node(precision_a, input_shape_a); auto data_b = linear_ir_ref->push_node(precision_b, input_shape_b); - auto copy_b = linear_ir_ref->push_node(data_b.second, precision_a, BRGEMM_TYPE::REPACKING_ONLY); + auto copy_b = linear_ir_ref->push_node(data_b.second, precision_a, brgemm_config); const auto copy_b_expr = *copy_b.first; init_expr_descriptors(copy_b_expr, {{full_dim, full_dim}, {full_dim, full_dim}}); - auto brgemm = linear_ir_ref->push_node(data_a.second, copy_b.second, BRGEMM_TYPE::REPACKING_ONLY); + auto brgemm = linear_ir_ref->push_node(data_a.second, copy_b.second, brgemm_config); const auto& brgemm_expr = *brgemm.first; init_expr_descriptors(brgemm_expr, {{m_blk, full_dim}, {full_dim, full_dim}, {m_blk, full_dim}}); create_brgemm_with_copy_b_loop_infos(linear_ir_ref, brgemm_expr, copy_b_expr, m, m_blk); @@ -271,26 +287,27 @@ TEST_F(BrgemmCPUBlockingTest, WithCompensations) { const ov::PartialShape input_shape_a{1, 16, m, k}; const ov::PartialShape input_shape_b{1, 16, k, n}; const auto precision = ov::element::i8; + const auto brgemm_config = brgemm_config_with_comps(); { auto data_a = linear_ir->push_node(precision, input_shape_a); auto data_b = linear_ir->push_node(precision, input_shape_b); - auto copy_b = linear_ir->push_node(data_b.second, precision, BRGEMM_TYPE::WITH_COMPENSATIONS); + auto copy_b = linear_ir->push_node(data_b.second, precision, brgemm_config); init_expr_descriptors(*copy_b.first); const auto& copy_b_n = copy_b.second; - auto brgemm = linear_ir->push_node(data_a.second, copy_b_n->output(0), copy_b_n->output(1), BRGEMM_TYPE::WITH_COMPENSATIONS); + auto brgemm = linear_ir->push_node(data_a.second, copy_b_n->output(0), copy_b_n->output(1), brgemm_config); init_expr_descriptors(*brgemm.first); auto result = linear_ir->push_node(brgemm.second); } { auto data_a = 
linear_ir_ref->push_node(precision, input_shape_a); auto data_b = linear_ir_ref->push_node(precision, input_shape_b); - auto copy_b = linear_ir_ref->push_node(data_b.second, precision, BRGEMM_TYPE::WITH_COMPENSATIONS); + auto copy_b = linear_ir_ref->push_node(data_b.second, precision, brgemm_config); const auto copy_b_expr = *copy_b.first; init_expr_descriptors(copy_b_expr, {{full_dim, full_dim}, {full_dim, full_dim}, {1, full_dim}}); const auto& copy_b_n = copy_b.second; - auto brgemm = linear_ir_ref->push_node(data_a.second, copy_b_n->output(0), copy_b_n->output(1), BRGEMM_TYPE::WITH_COMPENSATIONS); + auto brgemm = linear_ir_ref->push_node(data_a.second, copy_b_n->output(0), copy_b_n->output(1), brgemm_config); const auto& brgemm_expr = *brgemm.first; init_expr_descriptors(brgemm_expr, {{m_blk, full_dim}, {full_dim, full_dim}, {1, full_dim}, {m_blk, full_dim}}); create_brgemm_loop_infos(linear_ir_ref, brgemm_expr, m, m_blk); @@ -306,28 +323,29 @@ TEST_F(BrgemmCPUBlockingTest, AMX) { const ov::PartialShape input_shape_a{1, 16, m, k}; const ov::PartialShape input_shape_b{1, 16, k, n}; const auto precision = ov::element::bf16; + const auto brgemm_config = brgemm_config_amx(precision); { auto data_a = linear_ir->push_node(precision, input_shape_a); auto data_b = linear_ir->push_node(precision, input_shape_b); auto scratch = linear_ir->push_node(ov::Shape{BrgemmCPU::SCRATCH_BYTE_SIZE}); - auto copy_b = linear_ir->push_node(data_b.second, precision, BRGEMM_TYPE::REPACKING_ONLY); + auto copy_b = linear_ir->push_node(data_b.second, precision, brgemm_config); init_expr_descriptors(*copy_b.first); - auto brgemm = linear_ir->push_node(data_a.second, copy_b.second, scratch.second, BRGEMM_TYPE::WITH_AMX); + auto brgemm = linear_ir->push_node(data_a.second, copy_b.second, scratch.second, brgemm_config); init_expr_descriptors(*brgemm.first); auto result = linear_ir->push_node(brgemm.second); } { auto data_a = linear_ir_ref->push_node(precision, input_shape_a); auto data_b = 
linear_ir_ref->push_node(precision, input_shape_b); - auto copy_b = linear_ir_ref->push_node(data_b.second, precision, BRGEMM_TYPE::REPACKING_ONLY); + auto copy_b = linear_ir_ref->push_node(data_b.second, precision, brgemm_config); const auto copy_b_expr = *copy_b.first; init_expr_descriptors(copy_b_expr, {{full_dim, full_dim}, {full_dim, full_dim}}); auto scratch = linear_ir_ref->push_node(ov::Shape{BrgemmCPU::SCRATCH_BYTE_SIZE}); scratch.first->get()->set_loop_ids({0}); - auto brgemm = linear_ir_ref->push_node(data_a.second, copy_b.second, scratch.second, BRGEMM_TYPE::WITH_AMX); + auto brgemm = linear_ir_ref->push_node(data_a.second, copy_b.second, scratch.second, brgemm_config); const auto& brgemm_expr = *brgemm.first; init_expr_descriptors(brgemm_expr, {{m_blk, full_dim}, {full_dim, full_dim}, get_default_subtensor(), {m_blk, full_dim}}); create_brgemm_with_copy_b_loop_infos(linear_ir_ref, brgemm_expr, copy_b_expr, m, m_blk); diff --git a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp index 6dad1d4772f531..0c4aa39b4b0461 100644 --- a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp +++ b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp @@ -28,7 +28,6 @@ namespace ov { namespace test { namespace snippets { -using BRGEMM_TYPE = intel_cpu::brgemm_utils::BRGEMM_TYPE; /* Note[74841]: * This test is almost full copy of BufferAllocationTest class from openvino/src/common/snippets/tests/include/lowered/pass/buffer_allocation.hpp. 
@@ -150,7 +149,8 @@ class MHAFP32BufferAllocationTest : public BufferAllocationCPUTest { const auto load_reshape = std::make_shared(parameter1, 1, 0, order); const auto store = std::make_shared(load_reshape); const auto relu0 = std::make_shared(store); - const auto brgemm_cpu0 = std::make_shared(parameter0, relu0, BRGEMM_TYPE::STAND_ALONE); + const auto brgemm_config = intel_cpu::brgemm_utils::BrgemmConfig(ov::element::f32, dnnl::impl::cpu::x64::cpu_isa_t::avx512_core); + const auto brgemm_cpu0 = std::make_shared(parameter0, relu0, brgemm_config); const auto relu1 = std::make_shared(brgemm_cpu0); @@ -165,7 +165,7 @@ class MHAFP32BufferAllocationTest : public BufferAllocationCPUTest { const auto power = std::make_shared(reduce_sum, -1.f); const auto multiply = std::make_shared(exp, power); - const auto brgemm_cpu1 = std::make_shared(multiply, parameter2, BRGEMM_TYPE::STAND_ALONE); + const auto brgemm_cpu1 = std::make_shared(multiply, parameter2, brgemm_config); const auto relu2 = std::make_shared(brgemm_cpu1); @@ -208,10 +208,11 @@ class MHABF16AMXBufferAllocationTest : public BufferAllocationCPUTest { const auto relu0 = std::make_shared(convert0); const auto convert1 = std::make_shared(relu0, ov::element::bf16); - const auto brgemm_copyb0 = std::make_shared(convert1, ov::element::bf16); + const auto brgemm_config0 = intel_cpu::brgemm_utils::BrgemmConfig(ov::element::bf16, dnnl::impl::cpu::x64::cpu_isa_t::avx512_core_amx, + false, true, false, true); + const auto brgemm_copyb0 = std::make_shared(convert1, ov::element::bf16, brgemm_config0); const auto scratch0 = std::make_shared(ov::Shape{ov::intel_cpu::BrgemmCPU::SCRATCH_BYTE_SIZE}); - const auto brgemm_cpu0 = std::make_shared( - parameter0, brgemm_copyb0->output(0), scratch0, BRGEMM_TYPE::WITH_AMX); + const auto brgemm_cpu0 = std::make_shared(parameter0, brgemm_copyb0->output(0), scratch0, brgemm_config0); const auto relu1 = std::make_shared(brgemm_cpu0); @@ -228,10 +229,12 @@ class MHABF16AMXBufferAllocationTest 
: public BufferAllocationCPUTest { const auto convert2 = std::make_shared(multiply, ov::element::bf16); - const auto brgemm_copyb1 = std::make_shared(parameter2, ov::element::bf16); + + const auto brgemm_config1 = intel_cpu::brgemm_utils::BrgemmConfig(ov::element::bf16, dnnl::impl::cpu::x64::cpu_isa_t::avx512_core_amx, + false, true, false, true); + const auto brgemm_copyb1 = std::make_shared(parameter2, ov::element::bf16, brgemm_config1); const auto scratch1 = std::make_shared(ov::Shape{ov::intel_cpu::BrgemmCPU::SCRATCH_BYTE_SIZE}); - const auto brgemm_cpu1 = std::make_shared( - convert2, brgemm_copyb1->output(0), scratch1, BRGEMM_TYPE::WITH_AMX); + const auto brgemm_cpu1 = std::make_shared(convert2, brgemm_copyb1->output(0), scratch1, brgemm_config1); const auto relu2 = std::make_shared(brgemm_cpu1); diff --git a/src/tests/test_utils/common_test_utils/src/ov_tensor_utils.cpp b/src/tests/test_utils/common_test_utils/src/ov_tensor_utils.cpp index 133959fd9fdc6b..db661117cd5c5c 100644 --- a/src/tests/test_utils/common_test_utils/src/ov_tensor_utils.cpp +++ b/src/tests/test_utils/common_test_utils/src/ov_tensor_utils.cpp @@ -444,9 +444,9 @@ class Error { << " Diff: " << std::fabs(val.expected_value - val.actual_value) << " calculated_abs_threshold: " << val.threshold << " abs_threshold: " << abs_threshold << " rel_threshold: " << rel_threshold << "\n"; -#ifdef NDEBUG +#ifdef NDEBUG break; -#endif +#endif } throw std::runtime_error(msg); } else if (!less_or_equal(mvn_results, mvn_threshold)) {