From 4f0d37db2c66c487c97959d7182694f9065d8802 Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Mon, 2 Sep 2024 11:30:07 +0400 Subject: [PATCH] [Snippets] Created BufferExpression [Snippets][CPU] Implemented BrgemmCopyB specific buffers [Snippets] Fixed build --- .../include/snippets/lowered/expression.hpp | 38 ++++- .../snippets/lowered/expression_factory.hpp | 85 ++++++----- .../lowered/expressions/buffer_expression.hpp | 68 +++++++++ .../include/snippets/lowered/linear_ir.hpp | 37 ++++- .../pass/compute_buffer_allocation_size.hpp | 7 +- .../lowered/pass/define_buffer_clusters.hpp | 17 ++- .../lowered/pass/propagate_buffer_offset.hpp | 2 +- .../lowered/pass/set_buffer_reg_group.hpp | 14 +- .../lowered/pass/solve_buffer_memory.hpp | 9 +- .../lowered/pass/validate_buffers.hpp | 29 ++++ .../snippets/include/snippets/op/buffer.hpp | 79 ++-------- .../include/snippets/runtime_configurator.hpp | 2 +- src/common/snippets/src/generator.cpp | 5 +- .../snippets/src/lowered/expression.cpp | 133 +++++++++++++--- .../src/lowered/expression_factory.cpp | 113 ++++++-------- .../lowered/expressions/buffer_expression.cpp | 143 ++++++++++++++++++ src/common/snippets/src/lowered/linear_ir.cpp | 64 ++++++-- .../src/lowered/pass/allocate_buffers.cpp | 2 +- .../src/lowered/pass/assign_registers.cpp | 29 ++-- .../pass/clean_repeated_ptr_shifts.cpp | 16 +- .../pass/compute_buffer_allocation_size.cpp | 108 +------------ .../lowered/pass/define_buffer_clusters.cpp | 63 ++++---- .../src/lowered/pass/init_buffers_default.cpp | 24 ++- .../snippets/src/lowered/pass/init_loops.cpp | 4 +- .../src/lowered/pass/insert_buffers.cpp | 4 +- .../src/lowered/pass/insert_load_store.cpp | 4 +- .../pass/insert_specific_iterations.cpp | 20 ++- .../pass/normalize_buffer_reg_groups.cpp | 16 +- .../lowered/pass/propagate_buffer_offset.cpp | 34 ++--- .../src/lowered/pass/set_buffer_reg_group.cpp | 74 ++++----- .../src/lowered/pass/solve_buffer_memory.cpp | 52 +++---- 
.../snippets/src/lowered/pass/validate.cpp | 39 +---- .../src/lowered/pass/validate_buffers.cpp | 68 +++++++++ src/common/snippets/src/op/buffer.cpp | 124 ++++++++------- .../snippets/src/op/serialization_node.cpp | 78 +--------- src/common/snippets/src/op/subgraph.cpp | 2 + .../snippets/src/runtime_configurator.cpp | 14 +- .../src/shape_inference/shape_inference.cpp | 3 +- .../src/lowered/pass/buffer_allocation.cpp | 12 +- .../snippets/tests/src/lowering_utils.cpp | 3 +- .../snippets/aarch64/jit_kernel_emitter.cpp | 24 +-- .../emitters/snippets/x64/cpu_generator.cpp | 3 +- .../snippets/x64/jit_brgemm_emitter.cpp | 2 +- .../snippets/x64/jit_kernel_emitter.cpp | 24 +-- .../snippets/x64/jit_memory_emitters.cpp | 4 +- src/plugins/intel_cpu/src/extension.cpp | 3 +- src/plugins/intel_cpu/src/nodes/subgraph.cpp | 7 +- .../snippets/x64/op/brgemm_cpu.cpp | 2 +- .../snippets/x64/op/brgemm_utils.cpp | 40 ----- .../snippets/x64/op/brgemm_utils.hpp | 12 -- .../x64/pass/brgemm_to_brgemm_cpu.cpp | 2 +- .../x64/pass/lowered/brgemm_cpu_blocking.cpp | 4 +- .../lowered/insert_brgemm_copy_b_buffers.cpp | 140 +++++++++++++++++ .../lowered/insert_brgemm_copy_b_buffers.hpp | 65 ++++++++ .../set_brgemm_copy_b_buffers_shape.cpp | 43 ------ .../set_brgemm_copy_b_buffers_shape.hpp | 31 ---- .../x64/lowered/brgemm_blocking.cpp | 4 +- .../x64/lowered/buffer_allocation.cpp | 16 +- 58 files changed, 1156 insertions(+), 908 deletions(-) create mode 100644 src/common/snippets/include/snippets/lowered/expressions/buffer_expression.hpp create mode 100644 src/common/snippets/include/snippets/lowered/pass/validate_buffers.hpp create mode 100644 src/common/snippets/src/lowered/expressions/buffer_expression.cpp create mode 100644 src/common/snippets/src/lowered/pass/validate_buffers.cpp create mode 100644 src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.cpp create mode 100644 
src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.hpp delete mode 100644 src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.cpp delete mode 100644 src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.hpp diff --git a/src/common/snippets/include/snippets/lowered/expression.hpp b/src/common/snippets/include/snippets/lowered/expression.hpp index a04368e5605435..befbaeb3c526d6 100644 --- a/src/common/snippets/include/snippets/lowered/expression.hpp +++ b/src/common/snippets/include/snippets/lowered/expression.hpp @@ -17,15 +17,18 @@ namespace ov { namespace snippets { namespace lowered { +class ExpressionFactory; class LinearIR; using ExpressionPtr = std::shared_ptr; using ExpressionMap = std::unordered_map; class Expression : public std::enable_shared_from_this { friend class LinearIR; + friend class ExpressionFactory; friend class ExpressionPort; public: Expression() = default; + virtual ~Expression() = default; std::shared_ptr get_node() const; std::shared_ptr get_emitter() const; @@ -50,7 +53,8 @@ class Expression : public std::enable_shared_from_this { void set_input_port_connector(size_t port, PortConnectorPtr to); - void validate() const; + // Cannot be called in ctor because validate port attributes (descs, connectors) also + virtual void validate() const; ExpressionPort get_input_port(size_t i); ExpressionPort get_output_port(size_t i); @@ -61,16 +65,42 @@ class Expression : public std::enable_shared_from_this { bool needShapeInfer() const { return m_need_shape_infer; } const std::vector& get_loop_ids() const; void set_loop_ids(const std::vector& loops); - ExpressionPtr clone_with_new_inputs(const std::vector& new_inputs, - const std::shared_ptr& new_node) const; + ExpressionPtr clone_with_new_inputs(const std::shared_ptr& new_node, const std::vector& new_inputs, + const std::vector& new_in_descs = {}) const; 
ExpressionPtr clone_with_new_inputs(const ExpressionMap& expr_map, const std::shared_ptr& new_node) const; + virtual bool visit_attributes(AttributeVisitor &visitor); + + // Note that get_type_info_static and get_type_info are needed to mimic OPENVINO_RTTI interface, + // so the standard OPENVINO_RTTI(...) macros could be used in derived classes. + _OPENVINO_HIDDEN_METHOD static const ::ov::DiscreteTypeInfo& get_type_info_static() { + static ::ov::DiscreteTypeInfo type_info_static {"Expression"}; + type_info_static.hash(); + return type_info_static; + } + + virtual const DiscreteTypeInfo& get_type_info() const { + return get_type_info_static(); + } + + const char* get_type_name() const { + return get_type_info().name; + } + protected: Expression(const Expression& other); // Note: The constructor initialization is private since an expression can be created only by Linear IR. // The method must be used only by Linear IR builder of expressions! Expression(const std::shared_ptr& n, const std::shared_ptr& factory, bool need_shape_infer = true); - void update_node_and_connectors(const std::vector& new_inputs, const std::shared_ptr& new_node); + + // Virtual clone method wich is called in clone_with_new_inputs with common logic + virtual ExpressionPtr clone() const; + // Called in ctors to validate expression attributes + virtual void validate_attributes() const; + + // used in clone_with_new_inputs. 
New output port descriptors were inited automatically + void update_port_attributes(const std::shared_ptr& new_node, const std::vector& new_inputs, + const std::vector& new_in_descs, const std::vector& new_out_descs); std::shared_ptr m_source_node{nullptr}; std::shared_ptr m_emitter{nullptr}; diff --git a/src/common/snippets/include/snippets/lowered/expression_factory.hpp b/src/common/snippets/include/snippets/lowered/expression_factory.hpp index ca45fe936e0500..d617eb3d03b410 100644 --- a/src/common/snippets/include/snippets/lowered/expression_factory.hpp +++ b/src/common/snippets/include/snippets/lowered/expression_factory.hpp @@ -4,65 +4,72 @@ #pragma once -#include "linear_ir.hpp" +#include "expression.hpp" +#include "expressions/buffer_expression.hpp" -#include "snippets/snippets_isa.hpp" +#include "snippets/op/loop.hpp" +#include "snippets/op/buffer.hpp" +#include "snippets/op/perf_count.hpp" namespace ov { namespace snippets { namespace lowered { -class LinearIR::ExpressionFactory { +class ExpressionFactory { public: - template - static ExpressionPtr build(const std::shared_ptr& n, Args&&... 
params) { - if (const auto par = ov::as_type_ptr(n)) { - return create(par, params...); - } else if (const auto res = ov::as_type_ptr(n)) { - return create(res, params...); - } else if (const auto loop_begin = ov::as_type_ptr(n)) { - return create(loop_begin, params...); - } else if (const auto loop_end = ov::as_type_ptr(n)) { - return create(loop_end, params...); -#ifdef SNIPPETS_DEBUG_CAPS - } else if (const auto perf_counter = ov::as_type_ptr(n)) { - return create(perf_counter, params...); - } else if (const auto perf_counter = ov::as_type_ptr(n)) { - return create(perf_counter, params...); -#endif - } - return create(n, params...); + ExpressionFactory(std::shared_ptr shape_infer_factory) + : m_shape_infer_factory(std::move(shape_infer_factory)) {} + + template ::value, bool>::type = true> + std::shared_ptr build(const std::shared_ptr& n, const std::vector& inputs, Args... args) { + return create(n, inputs, m_shape_infer_factory, args...); } private: - /* -- Default Builders - initialize input port connectors from parents and create new output port connectors themselves */ - static ExpressionPtr create(const std::shared_ptr& par, const LinearIR& linear_ir); - static ExpressionPtr create(const std::shared_ptr& res, const LinearIR& linear_ir); - static ExpressionPtr create(const std::shared_ptr& n, const LinearIR& linear_ir); - - /* -- Input Builders - get input port connectors from method parameters and create new output port connectors themselves */ - static ExpressionPtr create(const std::shared_ptr& n, const std::vector& inputs, const LinearIR& linear_ir); - static ExpressionPtr create(const std::shared_ptr& n, const std::vector& inputs, const LinearIR& linear_ir); - static ExpressionPtr create(const std::shared_ptr& n, const std::vector& inputs, const LinearIR& linear_ir); + static ExpressionPtr create(const std::shared_ptr& par, const std::vector& inputs, + const std::shared_ptr& shape_infer_factory); + static ExpressionPtr create(const std::shared_ptr& res, 
const std::vector& inputs, + const std::shared_ptr& shape_infer_factory); + static ExpressionPtr create(const std::shared_ptr& n, const std::vector& inputs, + const std::shared_ptr& shape_infer_factory); + static ExpressionPtr create(const std::shared_ptr& n, const std::vector& inputs, + const std::shared_ptr& shape_infer_factory); // Note: PerfCountBegin nodes have a PerfCountEnd ov::Output, but corresponding expression should not have any outputs to avoid register allocation #ifdef SNIPPETS_DEBUG_CAPS - static ExpressionPtr create(const std::shared_ptr& n, - const std::vector& inputs, - const LinearIR& linear_ir); - static ExpressionPtr create(const std::shared_ptr& n, - const std::vector& inputs, - const LinearIR& linear_ir); - static ExpressionPtr create_without_connections(const std::shared_ptr& n, const LinearIR& linear_ir); + static ExpressionPtr create(const std::shared_ptr& n, const std::vector& inputs, + const std::shared_ptr& shape_infer_factory); + static ExpressionPtr create(const std::shared_ptr& n, const std::vector& inputs, + const std::shared_ptr& shape_infer_factory); + static ExpressionPtr create_without_connections(const std::shared_ptr& n, const std::shared_ptr& shape_infer_factory); #endif - // Creates inputs for expression using parent output port connectors - static void create_expression_inputs(const LinearIR& linear_ir, const ExpressionPtr& expr); + template ::value, bool>::type = true> + static std::shared_ptr create(const std::shared_ptr& n, const std::vector& inputs, + const std::shared_ptr& shape_infer_factory, Args... args) { + auto expr = std::shared_ptr(new T(n, shape_infer_factory, args...)); + init_expression_inputs(expr, inputs); + create_expression_outputs(expr); + expr->validate(); + // todo: here we blindly synchronize input shapes from parent and child. 
Remove this when shapes will be stored in port connector itself + if (shape_infer_factory) + expr->updateShapes(); + return expr; + } + // Creates new output port connectors static void create_expression_outputs(const ExpressionPtr& expr); // The method verifies of input port connectors to availability of the expression as consumer and add it if missed static void init_expression_inputs(const ExpressionPtr& expr, const std::vector& inputs); + + const std::shared_ptr m_shape_infer_factory = nullptr; }; +using ExpressionFactoryPtr = std::shared_ptr; + +template<> +std::shared_ptr ExpressionFactory::build(const std::shared_ptr& n, const std::vector& inputs); } // namespace lowered } // namespace snippets diff --git a/src/common/snippets/include/snippets/lowered/expressions/buffer_expression.hpp b/src/common/snippets/include/snippets/lowered/expressions/buffer_expression.hpp new file mode 100644 index 00000000000000..94fdf1c8dcdc1a --- /dev/null +++ b/src/common/snippets/include/snippets/lowered/expressions/buffer_expression.hpp @@ -0,0 +1,68 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "snippets/lowered/expression.hpp" + +#include "snippets/utils/utils.hpp" + + +namespace ov { +namespace snippets { +namespace lowered { + +// To avoid cycle-dependancy of includes, we forward-declare LoopManager +class LoopManager; +/** + * @interface BufferExpression + * @brief This is a base class for memory storage. + * Notes that Buffer should be a single consumer for operation output port + * @param m_allocation_size - memory size for allocation in bytes. Dynamic value means undefined size. + * @param m_offset - offset in common Buffer scratchpad + * @param m_reg_group - number of register group. The Buffers from the same register group will have the same GPR + * @param m_cluster_id - number of cluster. The Buffers from the same cluster shares memory between them and will have the same offset. 
+ * @ingroup snippets + */ +class BufferExpression : public Expression { + friend class ExpressionFactory; +public: + OPENVINO_RTTI("BufferExpression", "0", Expression) + BufferExpression() = default; + + bool visit_attributes(AttributeVisitor &visitor) override; + + size_t get_reg_group() const { return m_reg_group; } + size_t get_cluster_id() const { return m_cluster_id; } + size_t get_offset() const { return m_offset; } + size_t get_allocation_size() const { return m_allocation_size; } + size_t get_byte_size() const; + + void set_reg_group(size_t reg_group) { m_reg_group = reg_group; } + void set_cluster_id(size_t cluster) { m_cluster_id = cluster; } + void set_allocation_size(size_t size) { m_allocation_size = size; } + void set_offset(size_t offset) { m_offset = offset; } + + virtual void init_allocation_size(const std::shared_ptr& loop_manager, size_t allocation_rank); + + // Returns True, if allocation size is known. Otherwise returns False - allocation size is undefined + bool is_defined() const; + +protected: + BufferExpression(const BufferExpression& other); + BufferExpression(const std::shared_ptr& n, const std::shared_ptr& factory); + + ExpressionPtr clone() const override; + void validate_attributes() const override; + + size_t m_allocation_size = utils::get_dynamic_value(); + size_t m_reg_group = 0; + size_t m_cluster_id = 0; + size_t m_offset = utils::get_dynamic_value(); +}; +using BufferExpressionPtr = std::shared_ptr; + +} // namespace lowered +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/include/snippets/lowered/linear_ir.hpp b/src/common/snippets/include/snippets/lowered/linear_ir.hpp index 55afd2c9ccd7ab..6038b608a76ff7 100644 --- a/src/common/snippets/include/snippets/lowered/linear_ir.hpp +++ b/src/common/snippets/include/snippets/lowered/linear_ir.hpp @@ -7,6 +7,8 @@ #include #include "snippets/lowered/expression.hpp" +#include "snippets/lowered/expression_factory.hpp" +#include 
"snippets/lowered/expressions/buffer_expression.hpp" #include "snippets/target_machine.hpp" #include "snippets/shape_inference/shape_inference.hpp" #ifdef SNIPPETS_DEBUG_CAPS @@ -51,9 +53,12 @@ using LoopManagerPtr = std::shared_ptr; */ class LinearIR { friend class LinearIRBuilder; - class ExpressionFactory; public: - using container = std::list; + template ::value, bool>::type = true> + using containerT = std::list>; + using container = containerT; + using buffers = containerT; using exprIt = container::iterator; using constExprIt = container::const_iterator; using exprReverseIt = container::reverse_iterator; @@ -62,12 +67,12 @@ class LinearIR { LinearIR(Config config = {}, const std::shared_ptr& factory = {}); LinearIR(const std::shared_ptr& m, const std::shared_ptr& factory, Config config = {}); - ExpressionPtr create_expression(const std::shared_ptr& n, const std::vector& inputs) const; + const ExpressionFactoryPtr& get_expr_factory() const; const container& get_ops() const { return m_expressions; } - const container& get_buffers() const { return m_buffer_expressions; } const container& get_parameters() const { return m_parameter_expressions; } const container& get_results() const { return m_result_expressions; } + const buffers& get_buffers() const { return m_buffer_expressions; } const Config& get_config() const { return m_config; } size_t get_static_buffer_scratchpad_size() const { return m_static_buffer_scratchpad_size; } @@ -186,6 +191,20 @@ class LinearIR { return std::make_pair(expr_it, node); } + /** + * @brief Insert new Expression to LinearIR, sets `loops_ids` as loop identifiers and inserts the expression on the `place` in LinearIR. 
+ * Also connects output ports to `consumers` + * @param new_expr the target expr which were created by ExpressionFactory + * @param loop_ids vector of loops ids that will be set for the expression + * @param update_loop_ports true - the helpers updates the corresponding loop ports after insertion otherwise - skip + * @param place before this place expression will be inserted + * @param consumers vector of expression port sets. These expression ports will be consumers of the expression. + * The vector may be empty or size of vector must be equal to output port count + * @return new expression iterator in LinearIR + */ + exprIt insert_expr(const ExpressionPtr& new_expr, const std::vector& loop_ids, + bool update_loop_ports, const constExprIt& place, const std::vector>& consumers); + /** * @brief Replace the several existing expressions with the one new expression that contains `new_node`. * Calls the helper `insert_node` and performs substitution: removes `old_exprs`. @@ -258,11 +277,12 @@ class LinearIR { }; static ov::NodeVector get_ordered_ops(const std::shared_ptr& model); - // Default way: expr port connectors are constructed basing on ov::Node connection - ExpressionPtr create_expression(const std::shared_ptr& n); ExpressionPtr create_expression(const std::shared_ptr& n, const std::vector& new_inputs, const std::vector& loop_ids, bool update_loop_ports, const std::vector>& consumers = {}); + // Creates inputs for expression using parent output port connectors + std::vector get_expression_inputs_by_node(const std::shared_ptr& n) const; + void register_expression(const ExpressionPtr& expr, bool io_allowed, double exec_num); void unregister_expression(const ExpressionPtr& expr); @@ -273,11 +293,12 @@ class LinearIR { std::unordered_map, std::shared_ptr> m_node2expression_map; container m_parameter_expressions{}; container m_result_expressions{}; - container m_buffer_expressions{}; + buffers m_buffer_expressions{}; Config m_config{}; LoopManagerPtr m_loop_manager; 
- std::shared_ptr m_shape_infer_factory; + std::shared_ptr m_shape_infer_factory = nullptr; std::shared_ptr m_shape_infer = nullptr; + std::shared_ptr m_expression_factory = nullptr; bool m_is_dynamic = false; // Size of static Buffer Scratchpad (Buffers with defined allocation size) diff --git a/src/common/snippets/include/snippets/lowered/pass/compute_buffer_allocation_size.hpp b/src/common/snippets/include/snippets/lowered/pass/compute_buffer_allocation_size.hpp index 830956338ef4a1..01d8b3ee85261e 100644 --- a/src/common/snippets/include/snippets/lowered/pass/compute_buffer_allocation_size.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/compute_buffer_allocation_size.hpp @@ -22,14 +22,9 @@ namespace pass { class ComputeBufferAllocationSize : public RangedPass { public: OPENVINO_RTTI("ComputeBufferAllocationSize", "RangedPass") - ComputeBufferAllocationSize(size_t buffer_allocation_rank) : m_buffer_allocation_rank(buffer_allocation_rank) {} + ComputeBufferAllocationSize() = default; bool run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override; - - static size_t get_allocation_size(const LoopManagerPtr& loop_manager, const ExpressionPtr& buffer_expr, size_t allocation_rank); - -private: - const size_t m_buffer_allocation_rank = 0; }; } // namespace pass diff --git a/src/common/snippets/include/snippets/lowered/pass/define_buffer_clusters.hpp b/src/common/snippets/include/snippets/lowered/pass/define_buffer_clusters.hpp index 824b0d4daea75d..1597eaa2377a50 100644 --- a/src/common/snippets/include/snippets/lowered/pass/define_buffer_clusters.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/define_buffer_clusters.hpp @@ -43,27 +43,27 @@ class DefineBufferClusters : public RangedPass { bool run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override; private: - using BufferCluster = std::set; + using BufferCluster = std::set; using 
BufferClusters = std::vector; - using BufferPorts = std::unordered_map>; + using BufferPorts = std::unordered_map>; /** * @brief Finds Buffer cluster in set of clusters which contains the target expression with Buffer * @param target target expression with Buffer op * @return vector iterator which refers to the found cluster */ - BufferClusters::iterator find_cluster_by_expr(const ExpressionPtr& target); + BufferClusters::iterator find_cluster_by_expr(const BufferExpressionPtr& target); /** * @brief Returns True if Buffer is direct source for the target expr (there aren't other loop between the Buffer and target expr) * @param buffer_expr expression with assumed Buffer op * @param target_expr expression with target op - LoopEnd or MemoryAccess op * @return boolean value */ - bool is_direct_buffer(const ExpressionPtr& buffer_expr, const ExpressionPtr& target_expr) const; + bool is_direct_buffer(const BufferExpressionPtr& buffer_expr, const ExpressionPtr& target_expr) const; /** * @brief Creates new buffer cluster if buffer_exprs is missed in clusters. If buffer_exprs is already in clusters, do nothing * @param buffer_expr expression with Buffer op */ - void create_new_cluster(const ExpressionPtr& buffer_expr); + void create_new_cluster(const BufferExpressionPtr& buffer_expr); /** * @brief Returns common ID of cluster if all buffer inside have the same Buffer ID. Otherwise returns the default value SIZE_MAX * that means that Buffers in cluster have different IDs. @@ -106,7 +106,7 @@ class DefineBufferClusters : public RangedPass { * @param buffer_expr expression with Buffer op * @return finalization offset - int64_t value */ - int64_t get_buffer_finalization_offset(const ExpressionPtr& buffer_expr) const; + int64_t get_buffer_finalization_offset(const BufferExpressionPtr& buffer_expr) const; /** * @brief Check if two Buffer expressions are connected to the same Loop. 
Set common LoopEnd as `loop` parameter and * indexes of Loop ports `up_idx` and `down_idx` if Buffers are really neighbours @@ -117,7 +117,8 @@ class DefineBufferClusters : public RangedPass { * @param down_idx the reference to port index of lower Buffer op to the Loop * @return Return True if the Buffers are connected to the same Loop */ - static bool are_buffer_neighbours(const ExpressionPtr& up, const ExpressionPtr& down, ExpressionPtr& loop, size_t& up_idx, size_t& down_idx); + static bool are_buffer_neighbours(const BufferExpressionPtr& up, const BufferExpressionPtr& down, ExpressionPtr& loop, + size_t& up_idx, size_t& down_idx); /** * @brief Unite clusters * @param inner_cluster_it iterator to inner cluster - buffer cluster is in the loop @@ -127,7 +128,7 @@ class DefineBufferClusters : public RangedPass { * @return Return True if clusters have been united */ bool unite_nested_clusters(const BufferClusters::iterator& inner_cluster_it, BufferCluster& outer_cluster, - const ExpressionPtr& outer_buffer, bool is_outer_up); + const BufferExpressionPtr& outer_buffer, bool is_outer_up); BufferClusters m_clusters; }; diff --git a/src/common/snippets/include/snippets/lowered/pass/propagate_buffer_offset.hpp b/src/common/snippets/include/snippets/lowered/pass/propagate_buffer_offset.hpp index a602569d793a55..d895b3a60cd26d 100644 --- a/src/common/snippets/include/snippets/lowered/pass/propagate_buffer_offset.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/propagate_buffer_offset.hpp @@ -34,7 +34,7 @@ class PropagateBufferOffset: public Pass { * @brief Propagates Buffer offset to the connected memory access ops * @param buffer_expr expression with Buffer op with inited offset */ - static void propagate(const ExpressionPtr& buffer_expr); + static void propagate(const BufferExpressionPtr& buffer_expr); }; } // namespace pass diff --git a/src/common/snippets/include/snippets/lowered/pass/set_buffer_reg_group.hpp 
b/src/common/snippets/include/snippets/lowered/pass/set_buffer_reg_group.hpp index 8faf2419a0a313..674e8e9964ac2c 100644 --- a/src/common/snippets/include/snippets/lowered/pass/set_buffer_reg_group.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/set_buffer_reg_group.hpp @@ -64,8 +64,8 @@ class SetBufferRegGroup: public RangedPass { static bool can_be_in_one_group(const ShiftPtrParams& lhs, const ShiftPtrParams& rhs); private: - using BufferPool = std::vector; - using BufferMap = std::map; + using BufferPool = LinearIR::buffers; + using BufferMap = std::map; /** * @brief Get Buffer Index in Buffer set @@ -73,7 +73,7 @@ class SetBufferRegGroup: public RangedPass { * @param pool set of Buffers from the Linear IR * @return index of target Buffer expression in set */ - static size_t get_buffer_idx(const ExpressionPtr& target, const BufferPool& pool); + static size_t get_buffer_idx(const BufferExpressionPtr& target, const BufferPool& pool); /** * @brief Create adjacency matrix for Buffer system. See comment in the method for more details. 
* @param linear_ir the target Linear IR @@ -99,8 +99,8 @@ class SetBufferRegGroup: public RangedPass { * @param buffers set of Buffers from the Linear IR * @param adj Target adjacency matrix */ - static void update_adj_matrix(const std::pair& lhs, - const std::pair& rhs, + static void update_adj_matrix(const std::pair& lhs, + const std::pair& rhs, const BufferPool& buffers, std::vector& adj); /** @@ -109,8 +109,8 @@ class SetBufferRegGroup: public RangedPass { * @param rhs Pair where first value is Expression with second Buffer and second value is data pointer shift params for it * @return Returns True if they are adjacent, otherwise returns False */ - static bool are_adjacent(const std::pair& lhs, - const std::pair& rhs); + static bool are_adjacent(const std::pair& lhs, + const std::pair& rhs); /** * @brief Find all buffers that are connected to the current LoopEnd diff --git a/src/common/snippets/include/snippets/lowered/pass/solve_buffer_memory.hpp b/src/common/snippets/include/snippets/lowered/pass/solve_buffer_memory.hpp index 74f2994deec971..c3e6564f9bdfec 100644 --- a/src/common/snippets/include/snippets/lowered/pass/solve_buffer_memory.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/solve_buffer_memory.hpp @@ -35,32 +35,33 @@ class SolveBufferMemory : public Pass { bool run(lowered::LinearIR& linear_ir) override; private: + using Buffers = LinearIR::buffers; /** * @brief Split buffer expressions of Linear IR into * static (with defined allocation size) and dynamic (with unknown size) buffers * @param buffer_expressions buffer expressions * @return the pair of static and dynamic buffer expressions */ - std::pair extract_static_and_dynamic_buffers(const LinearIR::container& buffer_expressions); + std::pair extract_static_and_dynamic_buffers(const Buffers& buffer_expressions); /** * @brief Initializes boxes for MemorySolver * @param buffer_expressions buffer expressions * @param linear_ir linear ir * @return vector of boxes for MemorySolver */ - 
std::vector init_boxes(const LinearIR::container& buffer_expressions, const LinearIR& linear_ir); + std::vector init_boxes(const Buffers& buffer_expressions, const LinearIR& linear_ir); /** * @brief Calculate memory size and set offset to buffer with defined allocation size * @param static_buffer_expressions static buffer expressions * @param linear_ir linear ir */ - void solve_static_buffer_memory(const LinearIR::container& static_buffer_expressions, const LinearIR& linear_ir); + void solve_static_buffer_memory(const Buffers& static_buffer_expressions, const LinearIR& linear_ir); /** * @brief Initialize offset for Buffer with undefined allocation size * Note: should be called after `solve_static_buffer_memory` * @param dynamic_buffer_expressions dynamic buffer expressions */ - void set_dynamic_buffer_offset(const LinearIR::container& dynamic_buffer_expressions); + void set_dynamic_buffer_offset(const Buffers& dynamic_buffer_expressions); size_t& m_static_buffer_scratchpad_size; diff --git a/src/common/snippets/include/snippets/lowered/pass/validate_buffers.hpp b/src/common/snippets/include/snippets/lowered/pass/validate_buffers.hpp new file mode 100644 index 00000000000000..b87697d054e4fb --- /dev/null +++ b/src/common/snippets/include/snippets/lowered/pass/validate_buffers.hpp @@ -0,0 +1,29 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "pass.hpp" + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { + +/** + * @interface ValidateBuffers + * @brief The pass validates buffer expression in Linear IR state + * @ingroup snippets + */ +class ValidateBuffers : public RangedPass { +public: + OPENVINO_RTTI("ValidateBuffers", "Pass") + ValidateBuffers() = default; + bool run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override; +}; + +} // namespace pass +} // namespace lowered +} // namespace snippets +} // namespace ov diff 
--git a/src/common/snippets/include/snippets/op/buffer.hpp b/src/common/snippets/include/snippets/op/buffer.hpp index e990a31d28b6c0..8a2f9680d9ac56 100644 --- a/src/common/snippets/include/snippets/op/buffer.hpp +++ b/src/common/snippets/include/snippets/op/buffer.hpp @@ -15,93 +15,44 @@ namespace op { /** * @interface Buffer * @brief This is a base class for memory storage. - * Notes: - * - All buffers with the same reg_group in a graph have the same memory pointer. So if we have a few buffers, - * each the corresponding MemoryAccess op for Buffer should have offset for common memory pointer of this Buffer - * - Buffer should be a single consumer for operation output port - * @param m_allocation_size - memory size for allocation in bytes. Dynamic value means undefined size. - * @param m_offset - offset in common Buffer scratchpad - * @param m_reg_group - number of register group. The Buffers from the same register group will have the same GPR - * @param m_cluster_id - number of cluster. The Buffers from the same cluster shares memory between them and will have the same offset. 
* @ingroup snippets */ class Buffer : public ov::op::Op { + enum class Type { + NewMemory, + IntermediateMemory + }; + public: OPENVINO_OP("Buffer", "SnippetsOpset"); Buffer() = default; - Buffer(const OutputVector& arguments, size_t allocation_size = utils::get_dynamic_value(), size_t reg_group = 0, size_t cluster_id = 0); + Buffer(const ov::Output& arg); + Buffer(const OutputVector& arguments); + Buffer(const ov::Shape& shape, ov::element::Type element_type = ov::element::u8); bool visit_attributes(AttributeVisitor& visitor) override; - size_t get_reg_group() const { return m_reg_group; } - size_t get_cluster_id() const { return m_cluster_id; } - size_t get_offset() const { return m_offset; } - size_t get_allocation_size() const { return m_allocation_size; } - size_t get_byte_size() const; - - void set_reg_group(size_t reg_group) { m_reg_group = reg_group; } - void set_cluster_id(size_t cluster) { m_cluster_id = cluster; } - void set_allocation_size(size_t allocation_size) { m_allocation_size = allocation_size; } - void set_offset(size_t offset) { m_offset = offset; } - - // Returns True, if allocation size is known. Otherwise returns False - allocation size is undefined - bool is_defined() const; - -protected: - size_t m_allocation_size = utils::get_dynamic_value(); - size_t m_reg_group = 0; - size_t m_cluster_id = 0; - size_t m_offset = utils::get_dynamic_value(); -}; - -/** - * @interface IntermediateMemoryBuffer - * @brief Represents an intermediate memory storage operation. It always has a parent. 
- * @ingroup snippets - * - */ -class IntermediateMemoryBuffer : public Buffer { -public: - OPENVINO_OP("IntermediateMemoryBuffer", "SnippetsOpset", Buffer); - IntermediateMemoryBuffer() = default; - IntermediateMemoryBuffer(const OutputVector& arguments, size_t allocation_size = utils::get_dynamic_value(), - size_t reg_group = 0, size_t cluster_id = 0); - IntermediateMemoryBuffer(const ov::Output& arg, size_t allocation_size = utils::get_dynamic_value(), - size_t reg_group = 0, size_t cluster_id = 0); - void validate_and_infer_types() override; - std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; -}; -/** - * @interface NewMemoryBuffer - * @brief Represents a new empty memory for allocation with specified shape. It has no parent operations. - * @ingroup snippets - * - */ -class NewMemoryBuffer : public Buffer { -public: - OPENVINO_OP("NewMemoryBuffer", "SnippetsOpset", Buffer); - NewMemoryBuffer() = default; - NewMemoryBuffer(const ov::Shape& shape, size_t reg_group = 0, size_t cluster_id = 0, ov::element::Type element_type = ov::element::u8); - - void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; - void set_element_type(ov::element::Type element_type); + size_t get_allocation_size() const; class ShapeInfer : public IShapeInferSnippets { ov::Shape m_shape; + Type m_type; public: explicit ShapeInfer(const std::shared_ptr& n); Result infer(const std::vector& input_shapes) override; }; -private: - ov::Shape m_output_shape; - ov::element::Type m_element_type = ov::element::u8; // u8 - default 1 byte +protected: + const Type m_type = Type::NewMemory; + const ov::Shape m_output_shape {}; + const ov::element::Type m_element_type = ov::element::u8; // u8 - default 1 byte }; + } // namespace op } // namespace snippets } // namespace ov diff --git a/src/common/snippets/include/snippets/runtime_configurator.hpp 
b/src/common/snippets/include/snippets/runtime_configurator.hpp index 169d63ee4baa92..660871b890b49b 100644 --- a/src/common/snippets/include/snippets/runtime_configurator.hpp +++ b/src/common/snippets/include/snippets/runtime_configurator.hpp @@ -218,7 +218,7 @@ class RuntimeConfigurator { std::vector m_io_descs = {}; std::vector m_io_data_sizes = {}; // [cluster_id -> buffer expressions ] - std::map> m_dynamic_buffer_clusters = {}; + std::map> m_dynamic_buffer_clusters = {}; std::vector m_ordered_loop_ids = {}; std::vector m_latest_shapes = {}; diff --git a/src/common/snippets/src/generator.cpp b/src/common/snippets/src/generator.cpp index 7ba5e830fd3362..d76545e0a5ba40 100644 --- a/src/common/snippets/src/generator.cpp +++ b/src/common/snippets/src/generator.cpp @@ -29,7 +29,7 @@ LoweringResult Generator::generate(const lowered::LinearIRPtr& linear_ir, const const auto kernel_op = op::Kernel::make_kernel(*linear_ir); kernel_op->compile_params = compile_params; - const auto kernel_expr = linear_ir->create_expression(kernel_op, std::vector{}); + const auto kernel_expr = linear_ir->get_expr_factory()->build<>(kernel_op, std::vector{}); const auto kernel = target->get(kernel_expr->get_node()->get_type_info())(kernel_expr); kernel->emit_code({}, {}); @@ -74,8 +74,7 @@ RegType Generator::get_op_out_reg_type(const ov::Output& out) const { std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) || - std::dynamic_pointer_cast(op) || - std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) diff --git a/src/common/snippets/src/lowered/expression.cpp b/src/common/snippets/src/lowered/expression.cpp index 3c4391da3a7250..01d301fdb86063 100644 --- a/src/common/snippets/src/lowered/expression.cpp +++ b/src/common/snippets/src/lowered/expression.cpp @@ -25,23 +25,20 @@ Expression::Expression(const std::shared_ptr& n, const 
std::shared_ptroutputs()) { m_output_port_descriptors.push_back(PortDescriptorUtils::get_port_descriptor_ptr(output)); } + validate_attributes(); } Expression::Expression(const Expression& other) : std::enable_shared_from_this(other), m_source_node(other.m_source_node), m_emitter(other.m_emitter), m_loop_ids(other.m_loop_ids), m_shapeInference(other.m_shapeInference), m_need_shape_infer(other.m_need_shape_infer), m_exec_num(other.m_exec_num) { - auto clone_ports_descriptors = [](const std::vector& src, std::vector& dst) { - dst.resize(src.size()); - for (size_t i = 0; i < src.size(); i++) - dst[i] = src[i]->clone(); - }; - clone_ports_descriptors(other.m_input_port_descriptors, m_input_port_descriptors); - clone_ports_descriptors(other.m_output_port_descriptors, m_output_port_descriptors); + m_input_port_descriptors = {}; + m_output_port_descriptors = {}; // Note that connectors are not filled on purpose, since you need a shared pointer to this to initialize them, // which is not available in constructor. Also, an expression copy is rarely expected to use the same connectors. 
m_input_port_connectors = {}; m_output_port_connectors = {}; + validate_attributes(); } const PortConnectorPtr& Expression::get_input_port_connector(size_t i) const { @@ -96,13 +93,17 @@ void Expression::set_reg_info(const RegInfo& rinfo) { } } + void Expression::validate_attributes() const { + OPENVINO_ASSERT(m_source_node != nullptr, + "The expression has null source node"); + } + void Expression::validate() const { + validate_attributes(); OPENVINO_ASSERT(m_input_port_descriptors.size() == m_input_port_connectors.size(), "The count of input ports and input port connectors must be equal"); OPENVINO_ASSERT(m_output_port_descriptors.size() == m_output_port_connectors.size(), "The count of output ports and output port connectors must be equal"); - OPENVINO_ASSERT(m_source_node != nullptr, - "The expression has null source node"); } void Expression::set_input_port_connector(size_t port, PortConnectorPtr to) { @@ -130,13 +131,12 @@ void Expression::set_loop_ids(const std::vector& loops) { m_loop_ids = loops; } -void Expression::update_node_and_connectors(const std::vector& new_inputs, - const std::shared_ptr& new_node) { - OPENVINO_ASSERT(m_source_node->get_type_info() == new_node->get_type_info(), - "Can't clone expression for a new node with incompatible type"); +void Expression::update_port_attributes(const std::shared_ptr& new_node, const std::vector& new_inputs, + const std::vector& new_in_descs, const std::vector& new_out_descs) { + OPENVINO_ASSERT(m_source_node->get_type_info() == new_node->get_type_info(), "Can't clone expression for a new node with incompatible type"); m_source_node = new_node; - OPENVINO_ASSERT(new_inputs.size() == m_input_port_descriptors.size(), - "Can't create Expression with new inputs: invalid number of input port connectors passed"); + OPENVINO_ASSERT(new_inputs.size() == new_in_descs.size(), "Can't create Expression with new inputs: invalid number of input port connectors passed"); + m_input_port_descriptors = new_in_descs; 
m_input_port_connectors = new_inputs; for (size_t i = 0; i < m_input_port_descriptors.size(); i++) { const auto& i_con = new_inputs[i]; @@ -144,16 +144,27 @@ void Expression::update_node_and_connectors(const std::vector& if (!i_con->found_consumer(i_port)) i_con->add_consumer(i_port); } + m_output_port_descriptors = new_out_descs; m_output_port_connectors.resize(m_output_port_descriptors.size()); for (size_t i = 0; i < m_output_port_descriptors.size(); i++) { m_output_port_connectors[i] = std::make_shared(get_output_port(i)); } } -ExpressionPtr Expression::clone_with_new_inputs(const std::vector& new_inputs, - const std::shared_ptr& new_node) const { - const auto& expr = std::shared_ptr(new Expression(*this)); - expr->update_node_and_connectors(new_inputs, new_node); +ExpressionPtr Expression::clone_with_new_inputs(const std::shared_ptr& new_node, + const std::vector& new_inputs, + const std::vector& new_in_descs) const { + auto clone_ports_descriptors = [](const std::vector& src) { + std::vector dst(src.size()); + for (size_t i = 0; i < src.size(); i++) + dst[i] = src[i]->clone(); + return dst; + }; + const auto& expr = clone(); + const auto& in_descs = !new_in_descs.empty() ? 
new_in_descs : clone_ports_descriptors(m_input_port_descriptors); + const auto& out_descs = clone_ports_descriptors(m_output_port_descriptors); + expr->update_port_attributes(new_node, new_inputs, in_descs, out_descs); + expr->validate(); return expr; } @@ -171,7 +182,89 @@ ExpressionPtr Expression::clone_with_new_inputs(const ExpressionMap& expr_map, new_inputs.emplace_back(input); } } - return clone_with_new_inputs(new_inputs, new_node); + return clone_with_new_inputs(new_node, new_inputs); +} + +ExpressionPtr Expression::clone() const { + return std::shared_ptr(new Expression(*this)); +} + +bool Expression::visit_attributes(AttributeVisitor &visitor) { + auto is_planar_layout = [](const std::vector& layout) { + for (size_t i = 0; i < layout.size(); ++i) + if (layout[i] != i) return false; + return true; + }; + auto subtensor2str = [](const VectorDims& subtensor) { + std::stringstream ss; + for (size_t i = 0; i < subtensor.size(); ++i) { + const auto& v = subtensor[i]; + const auto v_str = utils::is_full_dim_value(v) ? "FULL_DIM" : + utils::is_dynamic_value(v) ? "?" : std::to_string(v); + const auto del = i < subtensor.size() - 1 ? 
", " : ""; + ss << v_str << del; + } + return ss.str(); + }; + + std::vector in_regs, out_regs; + std::vector in_reg_types, out_reg_types; + std::vector> shapes; + std::vector> subtensors; + std::vector>> layouts; + for (size_t i = 0; i < get_input_count(); i++) { + const auto& desc = m_input_port_descriptors[i]; + const auto& shape = desc->get_shape(); + if (!shape.empty()) + shapes.emplace_back("in_shape_" + std::to_string(i), ov::PartialShape(shape)); + + const auto& subtensor = desc->get_subtensor(); + if (!subtensor.empty()) + subtensors.emplace_back("in_subtensor_" + std::to_string(i), subtensor2str(subtensor)); + + const auto& layout = desc->get_layout(); + if (!layout.empty() && !is_planar_layout(layout)) + layouts.emplace_back("in_layout_" + std::to_string(i), layout); + + in_reg_types.emplace_back(regTypeToStr(desc->get_reg().type)); + in_regs.emplace_back(desc->get_reg().idx); + } + for (size_t i = 0; i < get_output_count(); i++) { + const auto& desc = m_output_port_descriptors[i]; + const auto& shape = desc->get_shape(); + if (!shape.empty()) + shapes.emplace_back("out_shape_" + std::to_string(i), ov::PartialShape(shape)); + + const auto& subtensor = desc->get_subtensor(); + if (!subtensor.empty()) + subtensors.emplace_back("out_subtensor_" + std::to_string(i), subtensor2str(subtensor)); + + const auto& layout = desc->get_layout(); + if (!layout.empty() && !is_planar_layout(layout)) + layouts.emplace_back("out_layout_" + std::to_string(i), layout); + + out_reg_types.emplace_back(regTypeToStr(desc->get_reg().type)); + out_regs.emplace_back(desc->get_reg().idx); + } + + if (!in_regs.empty()) { + visitor.on_attribute("in_regs", in_regs); + visitor.on_attribute("in_reg_types", in_reg_types); + } + if (!out_regs.empty()) { + visitor.on_attribute("out_regs", out_regs); + visitor.on_attribute("out_reg_types", out_reg_types); + } + for (auto& s : shapes) + visitor.on_attribute(s.first, s.second); + for (auto& s : subtensors) + visitor.on_attribute(s.first, 
s.second); + for (auto& s : layouts) + visitor.on_attribute(s.first, s.second); + visitor.on_attribute("loop_ids", m_loop_ids); + visitor.on_attribute("execution_number", m_exec_num); + m_source_node->visit_attributes(visitor); + return true; } ExpressionPort Expression::get_input_port(size_t i) { diff --git a/src/common/snippets/src/lowered/expression_factory.cpp b/src/common/snippets/src/lowered/expression_factory.cpp index da60f9ac701b5f..c6ba395909c9f3 100644 --- a/src/common/snippets/src/lowered/expression_factory.cpp +++ b/src/common/snippets/src/lowered/expression_factory.cpp @@ -10,22 +10,29 @@ namespace ov { namespace snippets { namespace lowered { -void LinearIR::ExpressionFactory::create_expression_inputs(const LinearIR& linear_ir, const ExpressionPtr& expr) { - OPENVINO_ASSERT(expr != nullptr, "Failed expression inputs creation: expression is null"); - const auto& node = expr->get_node(); - - expr->m_input_port_connectors.resize(node->get_input_size(), nullptr); - for (const auto& input : node->inputs()) { - const auto input_source = input.get_source_output(); - const auto in_index = input.get_index(); - const auto& parent_expr = linear_ir.get_expr_by_node(input_source.get_node_shared_ptr()); - const auto& port_connector = parent_expr->get_output_port_connector(input_source.get_index()); - port_connector->add_consumer(expr->get_input_port(in_index)); - expr->m_input_port_connectors[in_index] = port_connector; +template<> +std::shared_ptr ExpressionFactory::build(const std::shared_ptr& n, const std::vector& inputs) { + if (const auto par = ov::as_type_ptr(n)) { + return create(par, inputs, m_shape_infer_factory); + } else if (const auto res = ov::as_type_ptr(n)) { + return create(res, inputs, m_shape_infer_factory); + } else if (const auto loop_begin = ov::as_type_ptr(n)) { + return create(loop_begin, inputs, m_shape_infer_factory); + } else if (const auto loop_end = ov::as_type_ptr(n)) { + return create(loop_end, inputs, m_shape_infer_factory); + } else 
if (const auto buffer = ov::as_type_ptr(n)) { + return create(buffer, inputs, m_shape_infer_factory); +#ifdef SNIPPETS_DEBUG_CAPS + } else if (const auto perf_counter = ov::as_type_ptr(n)) { + return create(perf_counter, inputs, m_shape_infer_factory); + } else if (const auto perf_counter = ov::as_type_ptr(n)) { + return create(perf_counter, inputs, m_shape_infer_factory); +#endif } + return create<>(n, inputs, m_shape_infer_factory); } -void LinearIR::ExpressionFactory::create_expression_outputs(const ExpressionPtr& expr) { +void ExpressionFactory::create_expression_outputs(const ExpressionPtr& expr) { OPENVINO_ASSERT(expr != nullptr, "Failed expression outputs creation: expression is null"); const auto& node = expr->get_node(); @@ -38,7 +45,7 @@ void LinearIR::ExpressionFactory::create_expression_outputs(const ExpressionPtr& } // The method verifies of input port connectors to availability of the expression as consumer and add it if missed -void LinearIR::ExpressionFactory::init_expression_inputs(const ExpressionPtr& expr, const std::vector& inputs) { +void ExpressionFactory::init_expression_inputs(const ExpressionPtr& expr, const std::vector& inputs) { for (size_t i = 0; i < inputs.size(); ++i) { const auto& input = inputs[i]; const auto consumers = input->get_consumers(); @@ -53,18 +60,21 @@ void LinearIR::ExpressionFactory::init_expression_inputs(const ExpressionPtr& ex expr->m_input_port_connectors = inputs; } -ExpressionPtr LinearIR::ExpressionFactory::create(const std::shared_ptr& par, const LinearIR& linear_ir) { +ExpressionPtr ExpressionFactory::create(const std::shared_ptr& par, const std::vector& inputs, + const std::shared_ptr& shape_infer_factory) { + OPENVINO_ASSERT(inputs.empty(), "Parameter cannot have inputs"); // Note: ctor of shared_ptr isn't friend class for Expression -> we cannot use directly make_shared(args) - auto expr = std::shared_ptr(new Expression(par, linear_ir.m_shape_infer_factory, false)); + auto expr = std::shared_ptr(new 
Expression(par, shape_infer_factory, false)); create_expression_outputs(expr); expr->validate(); return expr; } -ExpressionPtr LinearIR::ExpressionFactory::create(const std::shared_ptr& res, const LinearIR& linear_ir) { +ExpressionPtr ExpressionFactory::create(const std::shared_ptr& res, const std::vector& inputs, + const std::shared_ptr& shape_infer_factory) { // Note: ctor of shared_ptr isn't friend class for Expression -> we cannot use directly make_shared(args) - auto expr = std::shared_ptr(new Expression(res, linear_ir.m_shape_infer_factory)); - create_expression_inputs(linear_ir, expr); + auto expr = std::shared_ptr(new Expression(res, shape_infer_factory)); + init_expression_inputs(expr, inputs); // The Result node don't need output port (because of sense of the node). But each node in openvino must have one output at least. // The port descriptors are automatically created in constructor. We manually clean output ports. expr->m_output_port_descriptors.clear(); @@ -72,31 +82,19 @@ ExpressionPtr LinearIR::ExpressionFactory::create(const std::shared_ptr& n, const LinearIR& linear_ir) { - OPENVINO_ASSERT(!ov::is_type(n), "Default expression builder doesn't support LoopBegin and LoopEnd"); - // Note: ctor of shared_ptr isn't friend class for Expression - auto expr = std::shared_ptr(new Expression(n, linear_ir.m_shape_infer_factory)); - create_expression_inputs(linear_ir, expr); - create_expression_outputs(expr); - expr->validate(); - return expr; -} - -ExpressionPtr LinearIR::ExpressionFactory::create(const std::shared_ptr& n, - const std::vector& inputs, - const LinearIR& linear_ir) { +ExpressionPtr ExpressionFactory::create(const std::shared_ptr& n, const std::vector& inputs, + const std::shared_ptr& shape_infer_factory) { OPENVINO_ASSERT(inputs.empty(), "LoopBegin cannot have inputs"); - auto expr = std::shared_ptr(new Expression(n, linear_ir.m_shape_infer_factory, false)); + auto expr = std::shared_ptr(new Expression(n, shape_infer_factory, false)); 
init_expression_inputs(expr, inputs); create_expression_outputs(expr); expr->validate(); return expr; } -ExpressionPtr LinearIR::ExpressionFactory::create(const std::shared_ptr& n, - const std::vector& inputs, - const LinearIR& linear_ir) { - auto expr = std::shared_ptr(new Expression(n, linear_ir.m_shape_infer_factory, false)); +ExpressionPtr ExpressionFactory::create(const std::shared_ptr& n, const std::vector& inputs, + const std::shared_ptr& shape_infer_factory) { + auto expr = std::shared_ptr(new Expression(n, shape_infer_factory, false)); expr->m_input_port_descriptors.resize(inputs.size(), nullptr); for (size_t i = 0; i < inputs.size() - 1; ++i) { expr->m_input_port_descriptors[i] = std::make_shared(); @@ -113,23 +111,22 @@ ExpressionPtr LinearIR::ExpressionFactory::create(const std::shared_ptr& n, - const std::vector& inputs, - const LinearIR& linear_ir) { - OPENVINO_ASSERT(inputs.empty(), "PerfCountBegin factory do not accept any input connectors"); - return create_without_connections(n, linear_ir); +ExpressionPtr ExpressionFactory::create(const std::shared_ptr& n, const std::vector& inputs, + const std::shared_ptr& shape_infer_factory) { + OPENVINO_ASSERT(inputs.empty(), "PerfCountBegin shape_infer_factory do not accept any input connectors"); + return create_without_connections(n, shape_infer_factory); } -ExpressionPtr LinearIR::ExpressionFactory::create(const std::shared_ptr& n, - const std::vector& inputs, - const LinearIR& linear_ir) { - OPENVINO_ASSERT(inputs.empty(), "PerfCountEnd factory do not accept any input connectors"); - return create_without_connections(n, linear_ir); +ExpressionPtr ExpressionFactory::create(const std::shared_ptr& n, + const std::vector& inputs, + const std::shared_ptr& shape_infer_factory) { + OPENVINO_ASSERT(inputs.empty(), "PerfCountEnd shape_infer_factory do not accept any input connectors"); + return create_without_connections(n, shape_infer_factory); } -ExpressionPtr 
LinearIR::ExpressionFactory::create_without_connections(const std::shared_ptr& n, - const LinearIR& linear_ir) { - auto expr = std::shared_ptr(new Expression(n, linear_ir.m_shape_infer_factory, false)); +ExpressionPtr ExpressionFactory::create_without_connections(const std::shared_ptr& n, + const std::shared_ptr& shape_infer_factory) { + auto expr = std::shared_ptr(new Expression(n, shape_infer_factory, false)); expr->m_input_port_descriptors.clear(); expr->m_output_port_descriptors.clear(); expr->validate(); @@ -137,22 +134,6 @@ ExpressionPtr LinearIR::ExpressionFactory::create_without_connections(const std: } #endif -ExpressionPtr LinearIR::ExpressionFactory::create(const std::shared_ptr& n, - const std::vector& inputs, - const LinearIR& linear_ir) { - OPENVINO_ASSERT(!ov::is_type(n) && - !ov::is_type(n), - "Expression builder with inputs doesn't support Result and Parameter"); - auto expr = std::shared_ptr(new Expression(n, linear_ir.m_shape_infer_factory)); - init_expression_inputs(expr, inputs); - create_expression_outputs(expr); - expr->validate(); - // todo: here we blindly synchronize input shapes from parent and child. 
Remove this when shapes will be stored in - // port connector itself - if (linear_ir.m_shape_infer_factory) - expr->updateShapes(); - return expr; -} }// namespace lowered }// namespace snippets }// namespace ov diff --git a/src/common/snippets/src/lowered/expressions/buffer_expression.cpp b/src/common/snippets/src/lowered/expressions/buffer_expression.cpp new file mode 100644 index 00000000000000..7bf2b00da7d6ed --- /dev/null +++ b/src/common/snippets/src/lowered/expressions/buffer_expression.cpp @@ -0,0 +1,143 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + + +#include "snippets/lowered/expressions/buffer_expression.hpp" + +#include "snippets/lowered/loop_manager.hpp" +#include "snippets/op/buffer.hpp" + + +namespace ov { +namespace snippets { +namespace lowered { + +BufferExpression::BufferExpression(const BufferExpression& other) + : Expression(other), m_allocation_size(other.m_allocation_size), m_reg_group(other.m_reg_group), + m_cluster_id(other.m_cluster_id), m_offset(other.m_offset) {} + +BufferExpression::BufferExpression(const std::shared_ptr& n, const std::shared_ptr& factory) + : Expression(n, factory) { + const auto& buffer = ov::as_type_ptr(get_node()); + OPENVINO_ASSERT(buffer, "BufferExpression expects Buffer op"); + m_allocation_size = buffer->get_allocation_size(); +} + +ExpressionPtr BufferExpression::clone() const { + return std::shared_ptr(new BufferExpression(*this)); +} + +void BufferExpression::validate_attributes() const { + Expression::validate_attributes(); + OPENVINO_ASSERT(ov::is_type(get_node()), "BufferExpression expects Buffer op"); +} + +bool BufferExpression::visit_attributes(AttributeVisitor &visitor) { + auto allocation_size = utils::value2str(m_allocation_size); + auto offset = utils::value2str(m_offset); + visitor.on_attribute("allocation_size", allocation_size); + visitor.on_attribute("offset", offset); + visitor.on_attribute("reg_group", m_reg_group); + 
visitor.on_attribute("cluster_id", m_cluster_id); + return true; +} + +bool BufferExpression::is_defined() const { + return !utils::is_dynamic_value(m_allocation_size); +} + +size_t BufferExpression::get_byte_size() const { + if (is_defined()) + return m_allocation_size * get_node()->get_output_element_type(0).size(); + return utils::get_dynamic_value(); +} + +namespace { +std::vector get_parent_inner_loops(const std::vector& parent_loops, const std::vector& current_loops) { + const auto common_rank = std::min(parent_loops.size(), current_loops.size()); + size_t i = 0; + while (i < common_rank && parent_loops[i] == current_loops[i]) + ++i; + return std::vector(parent_loops.cbegin() + i, parent_loops.cend()); +} +} // namespace + +// Ticket: 113744 +// TODO: This logic covers only several specific cases so it should be generalized. +void BufferExpression::init_allocation_size(const std::shared_ptr& loop_manager, size_t allocation_rank) { + // Note: Buffer expressions can have more than one parent after the loops splitting transformation, but only the last parent + // can be used to access valid loop ports. More info in the ticket: 146646 + const auto buffer_in_idx = get_input_count() - 1; + const auto& parent_port = get_input_port_connector(buffer_in_idx)->get_source(); + const auto& parent_loop_ids = get_parent_inner_loops(parent_port.get_expr()->get_loop_ids(), get_loop_ids()); + const auto planar_shape = utils::get_preordered_vdims(parent_port); + + const size_t rank = allocation_rank >= 0 ? 
std::min(static_cast(allocation_rank), planar_shape.size()) + : planar_shape.size(); + + const auto& subtensor = ov::snippets::utils::get_projected_subtensor(parent_port); + + auto hard_equal = [&parent_port](const LoopPort& port) { + return *port.expr_port == parent_port; + }; + auto soft_equal = [&](const LoopPort& loop_port) { + const auto& port = *loop_port.expr_port; + // Check semantic of LoopPort + if (parent_port.get_index() != port.get_index() || + port.get_expr()->get_node()->get_type_info() != parent_port.get_expr()->get_node()->get_type_info()) + return false; + // Check that this LoopPort is connected to the same by semantic Buffer + const auto consumers = port.get_connected_ports(); + for (const auto& consumer : consumers) { + if (const auto buffer_consumer = ov::as_type_ptr(consumer.get_expr())) { + if (buffer_consumer->get_cluster_id() == m_cluster_id && consumer.get_index() == buffer_in_idx) + return true; + } + } + return false; + }; + + m_allocation_size = 1; + std::set processed_dim_idxs; + for (const auto& parent_loop : parent_loop_ids) { + const auto loop_info = loop_manager->get_loop_info(parent_loop); + const auto& output_ports = loop_info->get_output_ports(); + auto it = std::find_if(output_ports.begin(), output_ports.end(), hard_equal); + // [149219] : Try to find original loop port if this LoopInfo is cloned after InsertSpecificIterations + // and ports are not mapped on the original ExpressionPorts + if (it == output_ports.end()) { + it = std::find_if(output_ports.begin(), output_ports.end(), soft_equal); + OPENVINO_ASSERT(it != output_ports.end(), "compute_allocation_shape: output port of parent loop can not be found"); + } + const auto& loop_port = *it; + const auto& dim_idx = loop_port.dim_idx; + if (loop_port.is_incremented && dim_idx < rank) { + if (const auto& unified_loop_info = ov::as_type_ptr(loop_info)) + m_allocation_size = utils::dynamic_safe_mul(m_allocation_size, unified_loop_info->get_work_amount()); + else if (const auto& 
expanded_loop_info = ov::as_type_ptr(loop_info)) + m_allocation_size = utils::dynamic_safe_mul(m_allocation_size, expanded_loop_info->get_unified_loop_info()->get_work_amount()); + else + OPENVINO_THROW("Unknown LoopInfo type"); + processed_dim_idxs.insert(dim_idx); + } + } + const auto processing_rank = !processed_dim_idxs.empty() ? std::max(*processed_dim_idxs.rbegin(), subtensor.size()) : subtensor.size(); + for (size_t i = 0; i < std::min(processing_rank, rank); ++i) { + if (processed_dim_idxs.count(i) == 0) { + const auto multiplier = i < subtensor.size() ? *(subtensor.rbegin() + i) : *(planar_shape.rbegin() + i); + m_allocation_size = utils::dynamic_safe_mul(m_allocation_size, multiplier); + } + } + + // Corner case when the current information is not enough + if (processing_rank == 0 && processed_dim_idxs.empty()) { + for (size_t i = 0; i < rank; ++i) { + m_allocation_size = utils::dynamic_safe_mul(m_allocation_size, *(planar_shape.rbegin() + i)); + } + } +} + +} // namespace lowered +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/src/lowered/linear_ir.cpp b/src/common/snippets/src/lowered/linear_ir.cpp index 09640196b1fa17..6c5afadecb0285 100644 --- a/src/common/snippets/src/lowered/linear_ir.cpp +++ b/src/common/snippets/src/lowered/linear_ir.cpp @@ -25,7 +25,8 @@ LinearIR::LinearIR(Config config, const std::shared_ptr()), m_shape_infer_factory(factory), - m_shape_infer(std::make_shared(m_expressions, m_parameter_expressions, m_result_expressions)) {} + m_shape_infer(std::make_shared(m_expressions, m_parameter_expressions, m_result_expressions)), + m_expression_factory(std::make_shared(m_shape_infer_factory)) {} LinearIR::LinearIR(const std::shared_ptr& model, const std::shared_ptr& factory, @@ -34,7 +35,7 @@ LinearIR::LinearIR(const std::shared_ptr& model, constExprIt last_param = m_expressions.end(); for (const auto& n : get_ordered_ops(model)) { constExprIt insertion_pos = m_expressions.end(); - const auto expr = 
create_expression(n); + const auto expr = get_expr_factory()->build<>(n, get_expression_inputs_by_node(n)); // Scalar should be on the Linear IR beginning after Parameters to have valid expression order after Loop passes. // After these passes we must call pass MoveScalarToConsumer() to have a correct accuracy. @@ -43,8 +44,11 @@ LinearIR::LinearIR(const std::shared_ptr& model, insertion_pos = std::next(last_param); } - // exec_num = 0 since `insertion_pos` can be changed - register_expression(expr, true, 0); + // Some utils containers (for example, buffers) in Lir contain expressions in execution order + // so we have to pass exec order to registration. However, this enumeration is not optimal because + // the next each expr will has exec_num = prev_expr->exec_num + 1. + // For more efficient execution ordering we have to call "enumerate_expressions" in the end of LIR initialization + register_expression(expr, true, get_inserted_expr_exec_num(insertion_pos)); const auto& it = m_expressions.insert(insertion_pos, expr); if (ov::is_type(n)) last_param = it; @@ -57,12 +61,21 @@ LinearIR::LinearIR(const std::shared_ptr& model, enumerate_expressions(); } -ExpressionPtr LinearIR::create_expression(const std::shared_ptr& n) { - return ExpressionFactory::build(n, *this); +const ExpressionFactoryPtr& LinearIR::get_expr_factory() const { + OPENVINO_ASSERT(m_expression_factory, "ExpresstionFactory is missed!"); + return m_expression_factory; } -ExpressionPtr LinearIR::create_expression(const std::shared_ptr& n, const std::vector& inputs) const { - return ExpressionFactory::build(n, inputs, *this); +std::vector LinearIR::get_expression_inputs_by_node(const std::shared_ptr& n) const { + OPENVINO_ASSERT(n != nullptr, "Failed expression inputs getting: node is null"); + std::vector inputs(n->get_input_size(), nullptr); + for (const auto& input : n->inputs()) { + const auto input_source = input.get_source_output(); + const auto in_index = input.get_index(); + const auto& 
parent_expr = get_expr_by_node(input_source.get_node_shared_ptr()); + inputs[in_index] = parent_expr->get_output_port_connector(input_source.get_index()); + } + return inputs; } namespace { @@ -84,7 +97,7 @@ void update_consumers_and_regs(const ExpressionPtr& new_expr, const std::vector< ExpressionPtr LinearIR::create_expression(const std::shared_ptr& n, const std::vector& new_inputs, const std::vector& loop_ids, bool update_loop_ports, const std::vector>& consumers) { - const auto new_expr = create_expression(n, new_inputs); + const auto new_expr = get_expr_factory()->build<>(n, new_inputs); update_consumers_and_regs(new_expr, consumers); new_expr->set_loop_ids(loop_ids); @@ -178,13 +191,20 @@ void LinearIR::register_expression(const ExpressionPtr& expr, bool io_allowed, d "LinearIR::insert can't be used to add Parameters or Results to IR"); const auto& res = m_node2expression_map.insert({node, expr}); OPENVINO_ASSERT(res.second, "Duplicate node is detected in linear IR: ", node); + + expr->m_exec_num = exec_num; + if (ov::is_type(node)) m_parameter_expressions.push_back(expr); if (ov::is_type(node)) m_result_expressions.push_back(expr); - if (ov::is_type(node)) - m_buffer_expressions.push_back(expr); - expr->m_exec_num = exec_num; + if (const auto buffer_expr = ov::as_type_ptr(expr)) { + // just to align with execution order + auto it = m_buffer_expressions.cbegin(); + while (it != m_buffer_expressions.cend() && expr->m_exec_num > (*it)->get_exec_num()) + ++it; + m_buffer_expressions.insert(it, buffer_expr); + } } void LinearIR::unregister_expression(const ExpressionPtr& expr) { @@ -197,9 +217,9 @@ void LinearIR::unregister_expression(const ExpressionPtr& expr) { m_node2expression_map.erase(node); OPENVINO_ASSERT(!ov::is_type(node) && !ov::is_type(node), "unregister_expression mustn't be called for parameter or result expressions"); - if (ov::is_type(node)) { - const auto& it = std::find(m_buffer_expressions.cbegin(), m_buffer_expressions.cend(), expr); - 
OPENVINO_ASSERT(it != m_buffer_expressions.cend(), "Buffer Expression has not been found in the list of LinearIR Buffers!"); + if (const auto buffer_expr = ov::as_type_ptr(expr)) { + const auto& it = std::find(m_buffer_expressions.cbegin(), m_buffer_expressions.cend(), buffer_expr); + OPENVINO_ASSERT(it != m_buffer_expressions.cend(), "BufferExpression has not been found in the list of LinearIR Buffers!"); m_buffer_expressions.erase(it); } } @@ -245,7 +265,7 @@ LinearIR::exprIt LinearIR::insert(LinearIR::constExprIt pos, const NodeVector& n } LinearIR::exprIt LinearIR::insert(LinearIR::constExprIt pos, const std::shared_ptr& n) { - const auto& expr = create_expression(n); + const auto& expr = get_expr_factory()->build<>(n, get_expression_inputs_by_node(n)); register_expression(expr, m_config.m_manual_build_support, get_inserted_expr_exec_num(pos)); return m_expressions.insert(pos, expr); } @@ -338,6 +358,18 @@ LinearIR::exprIt LinearIR::insert_node(const std::shared_ptr& new_node return insert_node(new_node, new_inputs, loop_ids, update_loop_ports, place, consumers); } +LinearIR::exprIt LinearIR::insert_expr(const ExpressionPtr& new_expr, const std::vector& loop_ids, + bool update_loop_ports, const constExprIt& place, const std::vector>& consumers) { + update_consumers_and_regs(new_expr, consumers); + new_expr->set_loop_ids(loop_ids); + + const auto expr_it = insert(place, new_expr); + if (update_loop_ports) + get_loop_manager()->update_loop_ports(new_expr); + + return expr_it; +} + LinearIR::exprIt LinearIR::replace_with_node(const std::vector& old_exprs, const std::shared_ptr& new_node, const std::vector& loop_ids, const constExprIt& place) { OPENVINO_ASSERT(!old_exprs.empty(), "Failed to replace node: there are no old expressions for replacing"); diff --git a/src/common/snippets/src/lowered/pass/allocate_buffers.cpp b/src/common/snippets/src/lowered/pass/allocate_buffers.cpp index d72e35ceac533b..f76c4097b38f38 100644 --- 
a/src/common/snippets/src/lowered/pass/allocate_buffers.cpp +++ b/src/common/snippets/src/lowered/pass/allocate_buffers.cpp @@ -28,7 +28,7 @@ bool AllocateBuffers::run(lowered::LinearIR& linear_ir, lowered::LinearIR::const size_t buffer_scratchpad_size = 0; PassPipeline pipeline; - pipeline.register_pass(linear_ir.get_config().m_loop_depth); + pipeline.register_pass(); if (m_is_optimized_mode) { pipeline.register_pass(); pipeline.register_pass(); diff --git a/src/common/snippets/src/lowered/pass/assign_registers.cpp b/src/common/snippets/src/lowered/pass/assign_registers.cpp index e071460e5d85f1..2f921214bffed4 100644 --- a/src/common/snippets/src/lowered/pass/assign_registers.cpp +++ b/src/common/snippets/src/lowered/pass/assign_registers.cpp @@ -84,25 +84,22 @@ bool AssignRegisters::run(LinearIR& linear_ir) { auto accumulator_reg = 0lu; for (const auto& expr : exprs) { auto op = expr->get_node(); - if (const auto& buffer = ov::as_type_ptr(op)) { - const auto reg_group = buffer->get_reg_group(); + if (const auto& buffer_expr = ov::as_type_ptr(expr)) { + const auto reg_group = buffer_expr->get_reg_group(); // All buffers have one common data pointer - if (ov::is_type(buffer)) { - const auto assigned_reg = num_results + num_parameters + reg_group; - for (const auto& input : expr->get_input_port_connectors()) { - manually_assigned_gprs[input] = static_cast(assigned_reg); - // shape infer ops in the middle of subgraph. IntermediateMemoryBuffer is inserted before reshape as new loop should start. - // child shape info ops share the same memory as IntermediateMemoryBuffer. 
- const auto& shape_infer_consumers = utils::get_first_child_shape_infer_expr_seq(expr); - for (const auto& child_shape_infer_expr : shape_infer_consumers) { - manually_assigned_gprs[child_shape_infer_expr->get_input_port_connector(0)] = - manually_assigned_gprs[child_shape_infer_expr->get_output_port_connector(0)] = - static_cast(assigned_reg); - } + const auto assigned_reg = num_results + num_parameters + reg_group; + for (const auto& input : expr->get_input_port_connectors()) { + manually_assigned_gprs[input] = static_cast(assigned_reg); + // shape infer ops in the middle of subgraph. Buffer is inserted before reshape as new loop should start. + // child shape infer ops share the same memory as Buffer. + const auto& shape_infer_consumers = utils::get_first_child_shape_infer_expr_seq(expr); + for (const auto& child_shape_infer_expr : shape_infer_consumers) { + manually_assigned_gprs[child_shape_infer_expr->get_input_port_connector(0)] = + manually_assigned_gprs[child_shape_infer_expr->get_output_port_connector(0)] = + static_cast(assigned_reg); + } } - manually_assigned_gprs[expr->get_output_port_connector(0)] = - static_cast(num_results + num_parameters + reg_group); + manually_assigned_gprs[expr->get_output_port_connector(0)] = static_cast(assigned_reg); } else if (ov::is_type(op) || ov::is_type(op)) { // Only in ReduceDecomposition Reduce ops use HorizonMax/HorizonSum and VectorBuffer.
// We should manually set the one vector register for VectorBuffer and Max/Sum output to simulate a accumulator diff --git a/src/common/snippets/src/lowered/pass/clean_repeated_ptr_shifts.cpp b/src/common/snippets/src/lowered/pass/clean_repeated_ptr_shifts.cpp index 4cf201047d63f5..e0397b03224bc3 100644 --- a/src/common/snippets/src/lowered/pass/clean_repeated_ptr_shifts.cpp +++ b/src/common/snippets/src/lowered/pass/clean_repeated_ptr_shifts.cpp @@ -32,10 +32,10 @@ bool CleanRepeatedDataPointerShifts::reuse_increments(const LoopManagerPtr& loop std::set read_data_exprs; for (size_t i = 0; i < input_count; ++i) { const auto& parent_output = loop_connectors[i]->get_source().get_expr(); - if (const auto buffer = ov::as_type_ptr(parent_output->get_node())) { + if (const auto buffer_expr = ov::as_type_ptr(parent_output)) { // If Buffer is missed in set, Just save - it's first meeting - if (buffers_groups.count(buffer->get_reg_group()) == 0) { - buffers_groups.insert(buffer->get_reg_group()); + if (buffers_groups.count(buffer_expr->get_reg_group()) == 0) { + buffers_groups.insert(buffer_expr->get_reg_group()); } else { // The Buffer with the same ID is in set - need to add this Buffer idx to set of Buffers for resetting resetting_data_indexes.insert(i); @@ -56,17 +56,17 @@ bool CleanRepeatedDataPointerShifts::reuse_increments(const LoopManagerPtr& loop size_t buffer_count = 0; size_t loop_count = 0; for (const auto& consumer_input : consumer_inputs) { - const auto& child_node = consumer_input.get_expr()->get_node(); - if (const auto buffer = ov::as_type_ptr(child_node)) { + const auto& consumer = consumer_input.get_expr(); + if (const auto buffer_expr = ov::as_type_ptr(consumer)) { buffer_count++; // If Buffer is missed in set, Just save - it's first meeting - if (buffers_groups.count(buffer->get_reg_group()) == 0) { - buffers_groups.insert(buffer->get_reg_group()); + if (buffers_groups.count(buffer_expr->get_reg_group()) == 0) { + 
buffers_groups.insert(buffer_expr->get_reg_group()); } else { // The Buffer with the same ID is in set - need to add this Buffer idx to set of Buffers for resetting resetting_data_indexes.insert(input_count + i); } - } else if (ov::is_type(child_node)) { + } else if (ov::is_type(consumer->get_node())) { loop_count++; } } diff --git a/src/common/snippets/src/lowered/pass/compute_buffer_allocation_size.cpp b/src/common/snippets/src/lowered/pass/compute_buffer_allocation_size.cpp index 85bbed324a9865..c6f0b9bcb936cb 100644 --- a/src/common/snippets/src/lowered/pass/compute_buffer_allocation_size.cpp +++ b/src/common/snippets/src/lowered/pass/compute_buffer_allocation_size.cpp @@ -14,112 +14,16 @@ namespace snippets { namespace lowered { namespace pass { -namespace { -std::vector get_parent_inner_loops(const std::vector& parent_loops, const std::vector& current_loops) { - const auto common_rank = std::min(parent_loops.size(), current_loops.size()); - size_t i = 0; - while (i < common_rank && parent_loops[i] == current_loops[i]) - ++i; - return std::vector(parent_loops.cbegin() + i, parent_loops.cend()); -} -} // namespace - -// Ticket: 113744 -// TODO: This logic covers only several specific cases so it should be generalized. -size_t ComputeBufferAllocationSize::get_allocation_size(const LoopManagerPtr& loop_manager, const ExpressionPtr& buffer_expr, size_t allocation_rank) { - const auto& current_buffer = ov::as_type_ptr(buffer_expr->get_node()); - OPENVINO_ASSERT(current_buffer, "`get_allocation_size` expected Buffer"); - - // Note: Buffer expressions can have more than one parent after the loops splitting transformation, but only the last parent - // can be used to access valid loop ports. 
More info in the ticket: 146646 - const auto buffer_in_idx = buffer_expr->get_input_count() - 1; - const auto& parent_port = buffer_expr->get_input_port_connector(buffer_in_idx)->get_source(); - const auto& parent_loop_ids = get_parent_inner_loops(parent_port.get_expr()->get_loop_ids(), buffer_expr->get_loop_ids()); - const auto planar_shape = utils::get_preordered_vdims(parent_port); - - const size_t rank = allocation_rank >= 0 ? std::min(static_cast(allocation_rank), planar_shape.size()) - : planar_shape.size(); - - const auto& subtensor = ov::snippets::utils::get_projected_subtensor(parent_port); - - auto hard_equal = [&parent_port](const LoopPort& port) { - return *port.expr_port == parent_port; - }; - auto soft_equal = [&](const LoopPort& loop_port) { - const auto& port = *loop_port.expr_port; - // Check semantic of LoopPort - if (parent_port.get_index() != port.get_index() || - port.get_expr()->get_node()->get_type_info() != parent_port.get_expr()->get_node()->get_type_info()) - return false; - // Check that this LoopPort is connected to the same by semantic Buffer - const auto consumers = port.get_connected_ports(); - for (const auto& consumer : consumers) { - if (const auto buffer_consumer = ov::as_type_ptr(consumer.get_expr()->get_node())) { - if (buffer_consumer->get_cluster_id() == current_buffer->get_cluster_id() && consumer.get_index() == buffer_in_idx) - return true; - } - } - return false; - }; - - size_t allocation_size = 1; - std::set processed_dim_idxs; - for (const auto& parent_loop : parent_loop_ids) { - const auto loop_info = loop_manager->get_loop_info(parent_loop); - const auto& output_ports = loop_info->get_output_ports(); - auto it = std::find_if(output_ports.begin(), output_ports.end(), hard_equal); - // [149219] : Try to find original loop port if this LoopInfo is cloned after InsertSpecificIterations - // and ports are not mapped on the original ExpressionPorts - if (it == output_ports.end()) { - it = std::find_if(output_ports.begin(), 
output_ports.end(), soft_equal); - OPENVINO_ASSERT(it != output_ports.end(), "compute_allocation_shape: output port of parent loop can not be found"); - } - const auto& loop_port = *it; - const auto& dim_idx = loop_port.dim_idx; - if (loop_port.is_incremented && dim_idx < rank) { - if (const auto& unified_loop_info = ov::as_type_ptr(loop_info)) - allocation_size = utils::dynamic_safe_mul(allocation_size, unified_loop_info->get_work_amount()); - else if (const auto& expanded_loop_info = ov::as_type_ptr(loop_info)) - allocation_size = utils::dynamic_safe_mul(allocation_size, expanded_loop_info->get_unified_loop_info()->get_work_amount()); - else - OPENVINO_THROW("Unknown LoopInfo type"); - processed_dim_idxs.insert(dim_idx); - } - } - const auto processing_rank = !processed_dim_idxs.empty() ? std::max(*processed_dim_idxs.rbegin(), subtensor.size()) : subtensor.size(); - for (size_t i = 0; i < std::min(processing_rank, rank); ++i) { - if (processed_dim_idxs.count(i) == 0) { - const auto multiplier = i < subtensor.size() ? 
*(subtensor.rbegin() + i) : *(planar_shape.rbegin() + i); - allocation_size = utils::dynamic_safe_mul(allocation_size, multiplier); - } - } - - // Corner case when the current information is not enough - if (processing_rank == 0 && processed_dim_idxs.empty()) { - for (size_t i = 0; i < rank; ++i) { - allocation_size = utils::dynamic_safe_mul(allocation_size, *(planar_shape.rbegin() + i)); - } - } - - return allocation_size; -} - bool ComputeBufferAllocationSize::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::ComputeBufferAllocationSize") + const auto& allocation_rank = linear_ir.get_config().m_loop_depth; const auto& loop_manager = linear_ir.get_loop_manager(); - - const auto& buffer_expressions = linear_ir.get_buffers(); - for (const auto& buffer_expr : buffer_expressions) { - const auto node = buffer_expr->get_node(); - if (const auto buffer = ov::as_type_ptr(node)) { - // If the current size is undefined, update it - // TODO [143395] : MemoryManager will return container with only dynamic buffers without any `is_defined()` - if (!buffer->is_defined()) - buffer->set_allocation_size(get_allocation_size(loop_manager, buffer_expr, m_buffer_allocation_rank)); - } else { - OPENVINO_ASSERT(ov::is_type(node), "Expected Buffer ops in Buffer expressions of LinearIR"); - } + for (const auto& buffer_expr : linear_ir.get_buffers()) { + // If the current size is undefined, update it + // TODO [143395] : MemoryManager will return container with only dynamic buffers without any `is_defined()` + if (!buffer_expr->is_defined()) + buffer_expr->init_allocation_size(loop_manager, allocation_rank); } return true; diff --git a/src/common/snippets/src/lowered/pass/define_buffer_clusters.cpp b/src/common/snippets/src/lowered/pass/define_buffer_clusters.cpp index f3e065173baf9d..c43b5d63a358c6 100644 --- 
a/src/common/snippets/src/lowered/pass/define_buffer_clusters.cpp +++ b/src/common/snippets/src/lowered/pass/define_buffer_clusters.cpp @@ -16,17 +16,16 @@ namespace pass { using ShiftPtrParams = SetBufferRegGroup::ShiftPtrParams; -DefineBufferClusters::BufferClusters::iterator DefineBufferClusters::find_cluster_by_expr(const ExpressionPtr& target) { +DefineBufferClusters::BufferClusters::iterator DefineBufferClusters::find_cluster_by_expr(const BufferExpressionPtr& target) { return std::find_if(m_clusters.begin(), m_clusters.end(), [&target](const BufferCluster& cluster) { return cluster.count(target) > 0; }); } -bool DefineBufferClusters::is_direct_buffer(const ExpressionPtr& buffer_expr, const ExpressionPtr& target_expr) const { - const auto buffer = ov::as_type_ptr(buffer_expr->get_node()); - return buffer && buffer_expr->get_loop_ids() == target_expr->get_loop_ids(); +bool DefineBufferClusters::is_direct_buffer(const BufferExpressionPtr& buffer_expr, const ExpressionPtr& target_expr) const { + return buffer_expr && buffer_expr->get_loop_ids() == target_expr->get_loop_ids(); } -void DefineBufferClusters::create_new_cluster(const ExpressionPtr& buffer_expr) { +void DefineBufferClusters::create_new_cluster(const BufferExpressionPtr& buffer_expr) { const auto cluster_it = find_cluster_by_expr(buffer_expr); // If Buffer is missed in clusters, create new cluster with the single Buffer node inside if (cluster_it == m_clusters.cend()) { @@ -36,9 +35,8 @@ void DefineBufferClusters::create_new_cluster(const ExpressionPtr& buffer_expr) size_t DefineBufferClusters::get_cluster_buffer_id(const BufferCluster& cluster) const { OPENVINO_ASSERT(!cluster.empty(), "Buffer cluster is empty!"); - const auto id = (ov::as_type_ptr(cluster.cbegin()->get()->get_node()))->get_reg_group(); - if (std::all_of(cluster.cbegin(), cluster.cend(), - [&id](const ExpressionPtr& expr) { return (ov::as_type_ptr(expr->get_node()))->get_reg_group() == id; })) { + const auto id = 
cluster.cbegin()->get()->get_reg_group(); + if (std::all_of(cluster.cbegin(), cluster.cend(), [&id](const BufferExpressionPtr& expr) { return expr->get_reg_group() == id; })) { return id; } return SIZE_MAX; @@ -53,7 +51,7 @@ DefineBufferClusters::BufferPorts DefineBufferClusters::get_input_buffers(const // Input Buffers for (size_t i = 0; i < in_count; ++i) { - const auto source_expr = connectors[i]->get_source().get_expr(); + const auto& source_expr = ov::as_type_ptr(connectors[i]->get_source().get_expr()); if (!is_direct_buffer(source_expr, loop_expr)) continue; // Save as input Buffer @@ -74,7 +72,7 @@ DefineBufferClusters::BufferPorts DefineBufferClusters::get_output_buffers(const for (size_t i = in_count; i < in_count + out_count; ++i) { for (const auto& consumer : connectors[i]->get_consumers()) { - auto consumer_expr = consumer.get_expr(); + const auto& consumer_expr = ov::as_type_ptr(consumer.get_expr()); if (!is_direct_buffer(consumer_expr, loop_expr)) continue; // Save as output Buffer @@ -102,7 +100,6 @@ void DefineBufferClusters::parse_loop(const LinearIR::constExprIt& expr_it) { for (const auto& out : output_buffers) { const auto output_buffer_expr = out.first; const auto output_buffer_port_idx = *(out.second.cbegin()); // Output port is always one - const auto output_buffer = ov::as_type_ptr(output_buffer_expr->get_node()); bool has_been_added = false; for (const auto& in : input_buffers) { @@ -110,17 +107,15 @@ void DefineBufferClusters::parse_loop(const LinearIR::constExprIt& expr_it) { if (visited_buffers.count(input_buffer_expr) > 0) continue; - const auto input_buffer = ov::as_type_ptr(input_buffer_expr->get_node()); - // If allocated sizes of buffers are unkown on compilation stage (dynamic), // we cannot be sure that they're will be the same in runtime. 
- if (!input_buffer->is_defined()|| !output_buffer->is_defined()) + if (!input_buffer_expr->is_defined()|| !output_buffer_expr->is_defined()) continue; // Memory can be reused if reading and writing are executed proportionally: // - the same reading/writing order // - the same buffer memory sizes - if ((input_buffer->get_byte_size() != output_buffer->get_byte_size()) || + if ((input_buffer_expr->get_byte_size() != output_buffer_expr->get_byte_size()) || (input_buffer_expr->get_output_port_descriptor(0)->get_layout() != output_buffer_expr->get_input_port_descriptor(0)->get_layout())) continue; @@ -184,13 +179,13 @@ void DefineBufferClusters::parse_nested_loops(const BufferPorts& input_buffers, for (auto it = std::reverse_iterator(outer_loop_end_expr_it); (*it)->get_node() != outer_loop_begin; ++it) { const auto& inner_expr = *it; - if (const auto inner_buffer = ov::as_type_ptr(inner_expr->get_node())) { - const auto inner_cluster_it = find_cluster_by_expr(inner_expr); + if (const auto inner_buffer_expr = ov::as_type_ptr(inner_expr)) { + const auto inner_cluster_it = find_cluster_by_expr(inner_buffer_expr); OPENVINO_ASSERT(inner_cluster_it != m_clusters.cend(), "Buffer cluster has not been found"); const auto inner_cluster_id = get_cluster_buffer_id(*inner_cluster_it); if (inner_cluster_id == SIZE_MAX) continue; - const auto final_offset = get_buffer_finalization_offset(inner_expr); + const auto final_offset = get_buffer_finalization_offset(inner_buffer_expr); auto unite = [&](const BufferPorts& ports, const bool is_input) { bool applied = false; @@ -200,13 +195,13 @@ void DefineBufferClusters::parse_nested_loops(const BufferPorts& input_buffers, // If the buffers are already in the same cluster or have different Buffer ID - skip if (cluster_it == inner_cluster_it) continue; // Buffer from one cluster must be only defined (with known allocation_size) or dynamic (with unknown allocation_size) - if (inner_buffer->is_defined() != 
ov::as_type_ptr(port.first->get_node())->is_defined()) continue; + if (inner_buffer_expr->is_defined() != port.first->is_defined()) continue; bool can_be_reused = true; for (const auto idx : port.second) { can_be_reused = can_be_reused && can_be_data_ptr_proportionally_shifted(outer_ptr_increments[idx], outer_data_sizes[idx], - final_offset, inner_buffer->get_element_type().size()); + final_offset, inner_buffer_expr->get_node()->get_element_type().size()); } if (!can_be_reused) continue; @@ -223,7 +218,7 @@ void DefineBufferClusters::parse_nested_loops(const BufferPorts& input_buffers, } } -int64_t DefineBufferClusters::get_buffer_finalization_offset(const ExpressionPtr& buffer_expr) const { +int64_t DefineBufferClusters::get_buffer_finalization_offset(const BufferExpressionPtr& buffer_expr) const { auto index = [](const std::vector& loop_inputs, const PortConnectorPtr& buffer_out) { const auto it = std::find(loop_inputs.cbegin(), loop_inputs.cend(), buffer_out); OPENVINO_ASSERT(it != loop_inputs.cend(), "Buffer output PortConnector has not been found in target LoopEnd inputs"); @@ -252,7 +247,7 @@ int64_t DefineBufferClusters::get_buffer_finalization_offset(const ExpressionPtr bool DefineBufferClusters::unite_nested_clusters(const BufferClusters::iterator& inner_cluster_it, BufferCluster& outer_cluster, - const ExpressionPtr& outer_buffer, bool is_outer_up) { + const BufferExpressionPtr& outer_buffer, bool is_outer_up) { for (const auto& inner_buffer : *inner_cluster_it) { ExpressionPtr common_loop_end_expr = nullptr; size_t outer_idx = SIZE_MAX, inner_idx = SIZE_MAX; @@ -267,9 +262,8 @@ bool DefineBufferClusters::unite_nested_clusters(const BufferClusters::iterator& const auto& inner_data_sizes = common_loop_end->get_element_type_sizes(); if (SetBufferRegGroup::can_be_in_one_group({ inner_data_sizes[up_idx], inner_ptr_increments[up_idx], inner_final_offsets[up_idx] }, { inner_data_sizes[down_idx], inner_ptr_increments[down_idx], inner_final_offsets[down_idx] })) 
{ - const auto buffer_reg_group = ov::as_type_ptr(outer_buffer->get_node())->get_reg_group(); for (const auto& inner_buffer : *inner_cluster_it) - ov::as_type_ptr(inner_buffer->get_node())->set_reg_group(buffer_reg_group); + inner_buffer->set_reg_group(outer_buffer->get_reg_group()); outer_cluster.insert(inner_cluster_it->cbegin(), inner_cluster_it->cend()); m_clusters.erase(inner_cluster_it); @@ -280,7 +274,8 @@ bool DefineBufferClusters::unite_nested_clusters(const BufferClusters::iterator& return false; } -bool DefineBufferClusters::are_buffer_neighbours(const ExpressionPtr& up, const ExpressionPtr& down, ExpressionPtr& loop, size_t& up_idx, size_t& down_idx) { +bool DefineBufferClusters::are_buffer_neighbours(const BufferExpressionPtr& up, const BufferExpressionPtr& down, ExpressionPtr& loop, + size_t& up_idx, size_t& down_idx) { auto find_input = [&down](const PortConnectorPtr& in) { return in->get_source().get_expr() == down; }; @@ -323,15 +318,15 @@ void DefineBufferClusters::parse_memory_access_op(const ExpressionPtr& expr) { // TODO: Some full MemoryAccess ops can have inplace inputs and outputs in general. 
// Need to add mechanism of inplace ports using MemoryAccess::PortDescriptor::inplace for (const auto& input : expr->get_input_port_connectors()) { - if (is_direct_buffer(input->get_source().get_expr(), expr)) { - create_new_cluster(input->get_source().get_expr()); - } + const auto& buffer_expr = ov::as_type_ptr(input->get_source().get_expr()); + if (is_direct_buffer(buffer_expr, expr)) + create_new_cluster(buffer_expr); } for (const auto& output : expr->get_output_port_connectors()) { for (const auto& consumer : output->get_consumers()) { - if (is_direct_buffer(consumer.get_expr(), expr)) { - create_new_cluster(consumer.get_expr()); - } + const auto& buffer_expr = ov::as_type_ptr(consumer.get_expr()); + if (is_direct_buffer(buffer_expr, expr)) + create_new_cluster(buffer_expr); } } } @@ -357,10 +352,8 @@ bool DefineBufferClusters::run(lowered::LinearIR& linear_ir, lowered::LinearIR:: for (size_t cluster_id = 0; cluster_id < m_clusters.size(); ++cluster_id) { const auto& cluster = m_clusters[cluster_id]; - std::for_each(cluster.cbegin(), cluster.cend(), [&cluster_id](const ExpressionPtr& buffer_expr) { - const auto& buffer = ov::as_type_ptr(buffer_expr->get_node()); - OPENVINO_ASSERT(buffer, "Buffer clusters expects Buffer nodes"); - buffer->set_cluster_id(cluster_id); + std::for_each(cluster.cbegin(), cluster.cend(), [&cluster_id](const BufferExpressionPtr& buffer_expr) { + buffer_expr->set_cluster_id(cluster_id); }); } diff --git a/src/common/snippets/src/lowered/pass/init_buffers_default.cpp b/src/common/snippets/src/lowered/pass/init_buffers_default.cpp index e48f833380e5e3..90a7ddf0b3d21c 100644 --- a/src/common/snippets/src/lowered/pass/init_buffers_default.cpp +++ b/src/common/snippets/src/lowered/pass/init_buffers_default.cpp @@ -18,21 +18,17 @@ bool InitBuffersDefault::run(lowered::LinearIR& linear_ir, lowered::LinearIR::co size_t idx = 0; size_t offset = 0; - for (auto expr_it = begin; expr_it != end; ++expr_it) { - const auto& expr = *expr_it; - const 
auto op = expr->get_node(); - if (const auto buffer = ov::as_type_ptr(op)) { - buffer->set_reg_group(idx); - buffer->set_cluster_id(idx); - - if (!buffer->is_defined()) { - buffer->set_offset(utils::get_dynamic_value()); - } else { - buffer->set_offset(offset); - offset += buffer->get_byte_size(); - } - idx++; + for (const auto& buffer_expr : linear_ir.get_buffers()) { + buffer_expr->set_reg_group(idx); + buffer_expr->set_cluster_id(idx); + + if (!buffer_expr->is_defined()) { + buffer_expr->set_offset(utils::get_dynamic_value()); + } else { + buffer_expr->set_offset(offset); + offset += buffer_expr->get_byte_size(); } + idx++; } m_buffer_scratchpad_size = offset; diff --git a/src/common/snippets/src/lowered/pass/init_loops.cpp b/src/common/snippets/src/lowered/pass/init_loops.cpp index 8e9b62d8fab825..9e8873ac6c7fe2 100644 --- a/src/common/snippets/src/lowered/pass/init_loops.cpp +++ b/src/common/snippets/src/lowered/pass/init_loops.cpp @@ -29,11 +29,11 @@ inline void init_is_incremented(LoopPort& port, size_t loop_id) { // Note: LoopPort connected to Buffer between two loops should not be incremented in the outermost loop // Consider the example below: // Store; Loop ids [0,1,2,3] - // IntermediateMemoryBuffer; Loop ids [0,1] + // Buffer; Loop ids [0,1] // Load; Loop ids [0,1,4,5] // Store is output port of Loop-1, but it should be incremented only in Loop-2 and Loop-3. Similar with Load. 
auto is_ignored = [=](const ExpressionPtr& target_expr) { - if (ov::is_type(target_expr->get_node())) { + if (ov::is_type(target_expr)) { const auto& target_loops = target_expr->get_loop_ids(); const auto i_max = std::min(expr_loops.size(), target_loops.size()); for (size_t i = 0; i < i_max && expr_loops[i] == target_loops[i]; i++) { diff --git a/src/common/snippets/src/lowered/pass/insert_buffers.cpp b/src/common/snippets/src/lowered/pass/insert_buffers.cpp index c6b5c3960e025b..fabb6573ab3b14 100644 --- a/src/common/snippets/src/lowered/pass/insert_buffers.cpp +++ b/src/common/snippets/src/lowered/pass/insert_buffers.cpp @@ -115,7 +115,7 @@ void InsertBuffers::insertion(LinearIR& linear_ir, // Current expr Loop identifies: 3, 4, 6 // Need to insert between 2nd and 4th Loops - after 2nd Loop const auto pos = insertion_position(linear_ir, loop_manager, parent_expr, expr); - const auto buffer = std::make_shared(parent->output(parent_port)); + const auto buffer = std::make_shared(parent->output(parent_port)); const auto buffer_consumer = has_shape_infer_parent ? 
top_shape_infer_expr->get_input_port(0) : *entry_port; linear_ir.insert_node(buffer, std::vector{ parent_expr_output }, buffer_loop_ids, false, pos, { buffer_consumer }); } @@ -191,7 +191,7 @@ void InsertBuffers::insertion(LinearIR& linear_ir, // Note: All potential consumers must have the same count of first equal Loop identifies and the same count of different last identifies const auto pos = insertion_position(linear_ir, loop_manager, expr, consumer_expr); - auto buffer = std::make_shared(node->output(port_idx)); + auto buffer = std::make_shared(node->output(port_idx)); // We cannot insert Node output connector on Buffer output because not all consumers of Node needs Buffer // Example: // Add diff --git a/src/common/snippets/src/lowered/pass/insert_load_store.cpp b/src/common/snippets/src/lowered/pass/insert_load_store.cpp index 231c783849908d..1885738eeb04b3 100644 --- a/src/common/snippets/src/lowered/pass/insert_load_store.cpp +++ b/src/common/snippets/src/lowered/pass/insert_load_store.cpp @@ -76,9 +76,9 @@ bool InsertLoadStore::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt be modified |= insert_load(linear_ir, expr_it); } else if (ov::is_type(node)) { modified |= insert_store(linear_ir, expr_it); - } else if (ov::is_type(node)) { + } else if (ov::is_type(expr)) { modified |= insert_load(linear_ir, expr_it); - if (ov::is_type(node)) + if (expr->get_input_count() > 0) modified |= insert_store(linear_ir, expr_it); } } diff --git a/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp b/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp index badf4b0477759c..1e99f8c845161f 100644 --- a/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp +++ b/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp @@ -32,15 +32,19 @@ void connect_cloned_body_with_buffers_outside(LinearIR::constExprIt cur_begin, L const auto& consumers = original_expr->get_output_port_connector(i)->get_consumers(); for (const 
auto& consumer : consumers) { const auto consumer_expr = consumer.get_expr(); - const auto buffer = ov::as_type_ptr(consumer_expr->get_node()); - if (buffer && std::find(cur_begin, cur_end, consumer.get_expr()) == cur_end) { - OutputVector new_inputs = {result_expr->get_node()->output(i)}; - for (const auto& input : consumer_expr->get_input_port_connectors()) { - const auto& source = input->get_source(); - new_inputs.push_back(source.get_expr()->get_node()->output(source.get_index())); + const auto buffer_expr = ov::as_type_ptr(consumer_expr); + if (buffer_expr && std::find(cur_begin, cur_end, consumer.get_expr()) == cur_end) { + std::vector new_descs = {buffer_expr->get_input_port_descriptor(consumer.get_index())->clone()}; + std::vector new_inputs = {result_expr->get_output_port_connector(i)}; + OutputVector new_op_inputs = {result_expr->get_node()->output(i)}; + for (size_t j = 0; j < buffer_expr->get_input_count(); ++j) { + const auto& source = buffer_expr->get_input_port_connector(j)->get_source(); + new_op_inputs.push_back(source.get_expr()->get_node()->output(source.get_index())); + new_descs.push_back(buffer_expr->get_input_port_descriptor(j)->clone()); + new_inputs.push_back(buffer_expr->get_input_port_connector(j)); } - const auto new_buffer = buffer->clone_with_new_inputs(new_inputs); - linear_ir.replace_with_node({consumer_expr}, new_buffer); + const auto new_buffer_op = buffer_expr->get_node()->clone_with_new_inputs(new_op_inputs); + linear_ir.replace_with_expr({consumer_expr}, buffer_expr->clone_with_new_inputs(new_buffer_op, new_inputs, new_descs)); break; } } diff --git a/src/common/snippets/src/lowered/pass/normalize_buffer_reg_groups.cpp b/src/common/snippets/src/lowered/pass/normalize_buffer_reg_groups.cpp index 3e235749ce7ca2..3431a198f90dc6 100644 --- a/src/common/snippets/src/lowered/pass/normalize_buffer_reg_groups.cpp +++ b/src/common/snippets/src/lowered/pass/normalize_buffer_reg_groups.cpp @@ -18,17 +18,13 @@ bool 
NormalizeBufferRegisterGroups::run(lowered::LinearIR& linear_ir, lowered::L // [ original Buffer reg group -> normalized ] std::map buffer_reg_groups; - for (auto expr_it = begin; expr_it != end; ++expr_it) { - const auto& expr = *expr_it; - const auto op = expr->get_node(); - if (const auto buffer = ov::as_type_ptr(op)) { - const auto group = buffer->get_reg_group(); - if (buffer_reg_groups.count(group) == 0) { - const auto new_id = buffer_reg_groups.size(); - buffer_reg_groups[group] = new_id; - } - buffer->set_reg_group(buffer_reg_groups[group]); + for (const auto& buffer_expr : linear_ir.get_buffers()) { + const auto group = buffer_expr->get_reg_group(); + if (buffer_reg_groups.count(group) == 0) { + const auto new_id = buffer_reg_groups.size(); + buffer_reg_groups[group] = new_id; } + buffer_expr->set_reg_group(buffer_reg_groups[group]); } return buffer_reg_groups.size(); } diff --git a/src/common/snippets/src/lowered/pass/propagate_buffer_offset.cpp b/src/common/snippets/src/lowered/pass/propagate_buffer_offset.cpp index abab05700c2344..4e7d17cf284f89 100644 --- a/src/common/snippets/src/lowered/pass/propagate_buffer_offset.cpp +++ b/src/common/snippets/src/lowered/pass/propagate_buffer_offset.cpp @@ -17,28 +17,24 @@ namespace lowered { namespace pass { -void PropagateBufferOffset::propagate(const ExpressionPtr& buffer_expr) { +void PropagateBufferOffset::propagate(const BufferExpressionPtr& buffer_expr) { // If Buffer has offset We set this offset in the connected MemoryAccess ops // to correctly read and write data because all Buffers have the common data pointer on buffer scratchpad - const auto buffer = ov::as_type_ptr(buffer_expr->get_node()); - OPENVINO_ASSERT(buffer, "Failed to propagate Buffer offset: PropagateBufferOffset expects Buffer op"); - const auto offset = buffer->get_offset(); + const auto offset = buffer_expr->get_offset(); // Propagate to up: in Store. 
Buffer can have only one Store - if (ov::is_type(buffer)) { - for (const auto& input : buffer_expr->get_input_port_connectors()) { - const auto& parent_output = input->get_source(); - const auto& parent_expr = parent_output.get_expr(); - const auto port = parent_output.get_index(); - const auto& parent_node = parent_expr->get_node(); - auto memory_access = std::dynamic_pointer_cast(parent_node); - if (memory_access && memory_access->is_memory_access_output_port(port)) { - memory_access->set_output_offset(offset, port); - } else { - OPENVINO_THROW( - "PropagateBufferOffset didn't find the connected MemoryAccess op to Buffer for offset propagation"); - } + for (const auto& input : buffer_expr->get_input_port_connectors()) { + const auto& parent_output = input->get_source(); + const auto& parent_expr = parent_output.get_expr(); + const auto port = parent_output.get_index(); + const auto& parent_node = parent_expr->get_node(); + auto memory_access = std::dynamic_pointer_cast(parent_node); + if (memory_access && memory_access->is_memory_access_output_port(port)) { + memory_access->set_output_offset(offset, port); + } else { + OPENVINO_THROW( + "PropagateBufferOffset didn't find the connected MemoryAccess op to Buffer for offset propagation"); } } // Propagate to down: in Load. 
Buffer can have several Load @@ -65,10 +61,8 @@ void PropagateBufferOffset::propagate(const ExpressionPtr& buffer_expr) { bool PropagateBufferOffset::run(lowered::LinearIR& linear_ir) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::PropagateBufferOffset"); - const auto& buffer_expressions = linear_ir.get_buffers(); - for (const auto& buffer_expr : buffer_expressions) { + for (const auto& buffer_expr : linear_ir.get_buffers()) propagate(buffer_expr); - } return true; } diff --git a/src/common/snippets/src/lowered/pass/set_buffer_reg_group.cpp b/src/common/snippets/src/lowered/pass/set_buffer_reg_group.cpp index 59c9bf21a0894a..76ece34e844618 100644 --- a/src/common/snippets/src/lowered/pass/set_buffer_reg_group.cpp +++ b/src/common/snippets/src/lowered/pass/set_buffer_reg_group.cpp @@ -28,7 +28,7 @@ bool operator!=(const SetBufferRegGroup::ShiftPtrParams& lhs, const SetBufferReg return !(rhs == lhs); } -size_t SetBufferRegGroup::get_buffer_idx(const ExpressionPtr& target, const BufferPool& pool) { +size_t SetBufferRegGroup::get_buffer_idx(const BufferExpressionPtr& target, const BufferPool& pool) { const auto iter = std::find(pool.cbegin(), pool.cend(), target); OPENVINO_ASSERT(iter != pool.cend(), "Buffer wasn't find in Buffer system of Subgraph"); return std::distance(pool.cbegin(), iter); @@ -44,8 +44,8 @@ bool SetBufferRegGroup::can_be_in_one_group(const ShiftPtrParams& lhs, const Shi return are_static && equal_ptr_params_shifting && (equal_element_type_sizes || (lhs.ptr_increment == 0 && lhs.finalization_offset == 0)); } -bool SetBufferRegGroup::are_adjacent(const std::pair& lhs, - const std::pair& rhs) { +bool SetBufferRegGroup::are_adjacent(const std::pair& lhs, + const std::pair& rhs) { const auto& lhs_ids = lhs.first->get_loop_ids(); const auto& rhs_ids = rhs.first->get_loop_ids(); const auto equal_loop_ids = lhs_ids == rhs_ids; @@ -64,10 +64,10 @@ bool SetBufferRegGroup::are_adjacent(const std::pair& lhs, - const std::pair& rhs, 
- const BufferPool& buffers, - std::vector& adj) { +void SetBufferRegGroup::update_adj_matrix(const std::pair& lhs, + const std::pair& rhs, + const BufferPool& buffers, + std::vector& adj) { const auto size = buffers.size(); const auto lhs_idx = get_buffer_idx(lhs.first, buffers); const auto rhs_idx = get_buffer_idx(rhs.first, buffers); @@ -125,14 +125,14 @@ SetBufferRegGroup::BufferMap SetBufferRegGroup::get_buffer_loop_neighbours(const BufferMap buffer_neighbours; for (size_t i = 0; i < input_count; ++i) { const auto& parent_output = loop_end_expr->get_input_port_connector(i)->get_source().get_expr(); - if (ov::is_type(parent_output->get_node())) { - if (buffer_neighbours.count(parent_output) > 0) { - OPENVINO_ASSERT(buffer_neighbours[parent_output].ptr_increment == ptr_increments[i] && - buffer_neighbours[parent_output].finalization_offset == finalization_offsets[i], + if (const auto buffer_expr = ov::as_type_ptr(parent_output)) { + if (buffer_neighbours.count(buffer_expr) > 0) { + OPENVINO_ASSERT(buffer_neighbours[buffer_expr].ptr_increment == ptr_increments[i] && + buffer_neighbours[buffer_expr].finalization_offset == finalization_offsets[i], "Invalid data pointer shifts: If Buffer has several consumers, this consumers must have the same shifts or zero"); continue; } - buffer_neighbours[parent_output] = { data_sizes[i], ptr_increments[i], finalization_offsets[i] }; + buffer_neighbours[buffer_expr] = { data_sizes[i], ptr_increments[i], finalization_offsets[i] }; } } for (size_t i = input_count; i < input_count + output_count; ++i) { @@ -142,8 +142,8 @@ SetBufferRegGroup::BufferMap SetBufferRegGroup::get_buffer_loop_neighbours(const size_t loop_count = 0; for (const auto& consumer_input : consumer_inputs) { const auto& child_expr = consumer_input.get_expr(); - if (ov::is_type(child_expr->get_node())) { - buffer_neighbours[child_expr] = { data_sizes[i], ptr_increments[i], finalization_offsets[i] }; + if (const auto buffer_expr = ov::as_type_ptr(child_expr)) { + 
buffer_neighbours[buffer_expr] = { data_sizes[i], ptr_increments[i], finalization_offsets[i] }; buffer_count++; } else if (ov::is_type(child_expr->get_node())) { loop_count++; @@ -163,34 +163,41 @@ SetBufferRegGroup::BufferMap SetBufferRegGroup::get_buffer_loop_inside(const Lin BufferMap inner_buffers; for (auto it = std::reverse_iterator(loop_end_it); (*it)->get_node() != loop_begin; ++it) { const auto& inner_expr = *it; - if (ov::is_type(inner_expr->get_node())) { + if (const auto buffer_expr = ov::as_type_ptr(inner_expr)) { // Set default zero values since it's not used for adjacency definition in case with Buffers in Loop - if (inner_buffers.count(inner_expr) == 0) - inner_buffers[inner_expr] = { 0, 0, 0 }; + if (inner_buffers.count(buffer_expr) == 0) + inner_buffers[buffer_expr] = { 0, 0, 0 }; } } return inner_buffers; } auto SetBufferRegGroup::coloring(BufferPool& buffers, std::vector& adj) -> std::map { + auto get_buffer_it = [&](size_t index) { + OPENVINO_ASSERT(index < buffers.size(), "Incorrect index"); + BufferPool::iterator it = buffers.begin(); + std::advance(it, index); + return it; + }; size_t color = 0; std::map color_groups; const auto size = buffers.size(); for (size_t i = 0; i < size; i++) { + auto& buffer_i = *get_buffer_it(i); // The Buffer is already colored (visited) - skip - if (!buffers[i]) + if (!buffer_i) continue; - const auto& buffer = buffers[i]; - color_groups[color].push_back(buffer); // Add to Color Group - buffers[i] = nullptr; // Remove from graph vertices + color_groups[color].push_back(buffer_i); // Add to Color Group + buffer_i = nullptr; // Remove from graph vertices // While Buffer `i` has non-coloured non-neighbours (while row `i` contains 0) - while (!std::accumulate(adj.begin() + i * size, adj.begin() + (i + 1) * size, true, std::logical_and())) { + while ((i + 1 < size) && !std::accumulate(adj.begin() + i * size, adj.begin() + (i + 1) * size, true, std::logical_and())) { size_t j = i + 1; + auto buffer_j_it = 
get_buffer_it(j); // Find first non-adjacent and non-visited (non-colored) Buffer to color him to the same color - for (; j < size; ++j) { - if (!adj[index(size, i, j)] && buffers[j]) + for (; j < size; ++j, ++buffer_j_it) { + if (!adj[index(size, i, j)] && *buffer_j_it) break; } @@ -199,9 +206,10 @@ auto SetBufferRegGroup::coloring(BufferPool& buffers, std::vector& adj) -> if (j == size) break; - const auto& neighbour_buffer = buffers[j]; + auto& buffer_j = *buffer_j_it; + const auto& neighbour_buffer = buffer_j; color_groups[color].push_back(neighbour_buffer); // Add to Color Group - buffers[j] = nullptr; // Remove from graph vertices + buffer_j = nullptr; // Remove from graph vertices // Unite adjacency links: // All the neighbors of Buffer `j` are added to the neighbors of Buffer `i` (the `vertices` are pulled together). // The result is an updated i-th row of the adjacency matrix, @@ -220,14 +228,7 @@ auto SetBufferRegGroup::coloring(BufferPool& buffers, std::vector& adj) -> bool SetBufferRegGroup::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::SetBufferRegGroup") // Identify Buffers using Graph coloring algorithm. 
- BufferPool buffer_pool; - - for (auto expr_it = begin; expr_it != end; ++expr_it) { - const auto& expr = *expr_it; - if (ov::is_type(expr->get_node())) { - buffer_pool.push_back(expr); - } - } + BufferPool buffer_pool = linear_ir.get_buffers(); // Creation of Adj matrix auto adj = create_adjacency_matrix(begin, end, buffer_pool); @@ -238,9 +239,8 @@ bool SetBufferRegGroup::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt for (const auto& pair : color_groups) { const auto color = pair.first; const auto& united_buffers = pair.second; - for (const auto& buffer_expr : united_buffers) { - ov::as_type_ptr(buffer_expr->get_node())->set_reg_group(color); - } + for (const auto& buffer_expr : united_buffers) + buffer_expr->set_reg_group(color); } return true; diff --git a/src/common/snippets/src/lowered/pass/solve_buffer_memory.cpp b/src/common/snippets/src/lowered/pass/solve_buffer_memory.cpp index 2a6b68738f7a68..ca85cefd369099 100644 --- a/src/common/snippets/src/lowered/pass/solve_buffer_memory.cpp +++ b/src/common/snippets/src/lowered/pass/solve_buffer_memory.cpp @@ -28,22 +28,17 @@ std::map create_execution_number_mapping(const LinearIR& linear_ir) } } // namespace -std::pair SolveBufferMemory::extract_static_and_dynamic_buffers(const LinearIR::container& buffer_expressions) { - LinearIR::container static_buffer_exprs, dynamic_buffer_exprs; +std::pair SolveBufferMemory::extract_static_and_dynamic_buffers(const Buffers& buffer_expressions) { + Buffers static_buffer_exprs, dynamic_buffer_exprs; for (const auto& buffer_expr : buffer_expressions) { - const auto& buffer = ov::as_type_ptr(buffer_expr->get_node()); - OPENVINO_ASSERT(buffer, "Buffer clusters expects Buffer nodes"); - - auto& clusters = buffer->is_defined() ? static_buffer_exprs : dynamic_buffer_exprs; + auto& clusters = buffer_expr->is_defined() ? static_buffer_exprs : dynamic_buffer_exprs; clusters.push_back(buffer_expr); } // Validation check that buffer cluster has only static or dynamic buffers. 
for (const auto& static_buffer : static_buffer_exprs) { - const auto static_cluster_id = ov::as_type_ptr(static_buffer->get_node())->get_cluster_id(); - auto is_cluster_ids_the_same = [&static_cluster_id](const ExpressionPtr& expr) { - return static_cluster_id == ov::as_type_ptr(expr->get_node())->get_cluster_id(); - }; + const auto static_cluster_id = static_buffer->get_cluster_id(); + auto is_cluster_ids_the_same = [&static_cluster_id](const BufferExpressionPtr& expr) { return static_cluster_id == expr->get_cluster_id(); }; OPENVINO_ASSERT(std::none_of(dynamic_buffer_exprs.cbegin(), dynamic_buffer_exprs.cend(), is_cluster_ids_the_same), "There is Buffer cluster with buffers which has defined and undefined allocation sizes"); } @@ -51,7 +46,7 @@ std::pair SolveBufferMemory::extract_s return { static_buffer_exprs, dynamic_buffer_exprs }; } -std::vector SolveBufferMemory::init_boxes(const LinearIR::container& buffer_expressions, const LinearIR& linear_ir) { +std::vector SolveBufferMemory::init_boxes(const Buffers& buffer_expressions, const LinearIR& linear_ir) { // ov::MemorySolver interface requires integer execution numbers (lifetime must be integer). 
// To align with ov::MemorySolver interface, we create the map [double -> integer] const auto int_execution_numbers = create_execution_number_mapping(linear_ir); @@ -63,9 +58,7 @@ std::vector SolveBufferMemory::init_boxes(const LinearIR: std::map map_boxes; for (const auto& buffer_expr : buffer_expressions) { - const auto& buffer = ov::as_type_ptr(buffer_expr->get_node()); - OPENVINO_ASSERT(buffer, "Buffer clusters expects Buffer nodes"); - auto cluster_id = static_cast(buffer->get_cluster_id()); + auto cluster_id = static_cast(buffer_expr->get_cluster_id()); if (map_boxes.count(cluster_id) == 0) { map_boxes[cluster_id] = { std::numeric_limits::max(), 0, 0, cluster_id }; @@ -98,7 +91,7 @@ std::vector SolveBufferMemory::init_boxes(const LinearIR: } OPENVINO_ASSERT(e_start <= e_finish, "Incorrect life time of buffer!"); - auto buffer_size = static_cast(buffer->get_byte_size()); + auto buffer_size = static_cast(buffer_expr->get_byte_size()); box.size = std::max(buffer_size, box.size); box.start = std::min(e_start, box.start); @@ -119,7 +112,7 @@ std::vector SolveBufferMemory::init_boxes(const LinearIR: return boxes; } -void SolveBufferMemory::solve_static_buffer_memory(const LinearIR::container& static_buffer_expressions, const LinearIR& linear_ir) { +void SolveBufferMemory::solve_static_buffer_memory(const Buffers& static_buffer_expressions, const LinearIR& linear_ir) { const auto boxes = init_boxes(static_buffer_expressions, linear_ir); ov::MemorySolver memSolver(boxes); @@ -127,37 +120,28 @@ void SolveBufferMemory::solve_static_buffer_memory(const LinearIR::container& st // Set offsets for Buffers for (const auto& buffer_expr : static_buffer_expressions) { - const auto& buffer = ov::as_type_ptr(buffer_expr->get_node()); - OPENVINO_ASSERT(buffer, "Buffer clusters expects Buffer nodes"); - - const auto offset = static_cast(memSolver.get_offset(static_cast(buffer->get_cluster_id()))); - buffer->set_offset(offset * m_alignment); // alignment in byte + const auto offset 
= static_cast(memSolver.get_offset(static_cast(buffer_expr->get_cluster_id()))); + buffer_expr->set_offset(offset * m_alignment); // alignment in byte } } -void SolveBufferMemory::set_dynamic_buffer_offset(const LinearIR::container& dynamic_buffer_expressions) { +void SolveBufferMemory::set_dynamic_buffer_offset(const Buffers& dynamic_buffer_expressions) { size_t offset = utils::get_dynamic_value(); // If there are not allocated memory for static buffers in LinearIR and there is only one cluster of dynamic buffer exprs, // we can force offset = 0 if (m_static_buffer_scratchpad_size == 0) { std::set dynamic_clusters; - for (const auto& dynamic_buffer_expr : dynamic_buffer_expressions) { - const auto& buffer = ov::as_type_ptr(dynamic_buffer_expr->get_node()); - OPENVINO_ASSERT(buffer, "Buffer clusters expects Buffer nodes"); - dynamic_clusters.insert(buffer->get_cluster_id()); - } + for (const auto& dynamic_buffer_expr : dynamic_buffer_expressions) + dynamic_clusters.insert(dynamic_buffer_expr->get_cluster_id()); + if (dynamic_clusters.size() == 1) offset = 0; } // Set offsets for Buffers - for (const auto& buffer_expr : dynamic_buffer_expressions) { - const auto& buffer = ov::as_type_ptr(buffer_expr->get_node()); - OPENVINO_ASSERT(buffer, "Buffer clusters expects Buffer nodes"); - - buffer->set_offset(offset); - } + for (const auto& buffer_expr : dynamic_buffer_expressions) + buffer_expr->set_offset(offset); } bool SolveBufferMemory::run(LinearIR& linear_ir) { @@ -165,7 +149,7 @@ bool SolveBufferMemory::run(LinearIR& linear_ir) { // TODO [143395] : MemoryManager will be able to return two containers with dynamic and static buffers // without additional `extract` functions in all passes - LinearIR::container static_buffer_exprs, dynamic_buffer_exprs; + Buffers static_buffer_exprs, dynamic_buffer_exprs; std::tie(static_buffer_exprs, dynamic_buffer_exprs) = extract_static_and_dynamic_buffers(linear_ir.get_buffers()); if (!static_buffer_exprs.empty()) diff --git 
a/src/common/snippets/src/lowered/pass/validate.cpp b/src/common/snippets/src/lowered/pass/validate.cpp index 24fff8ab0fc00b..2e9e5813c03264 100644 --- a/src/common/snippets/src/lowered/pass/validate.cpp +++ b/src/common/snippets/src/lowered/pass/validate.cpp @@ -64,10 +64,12 @@ void validate_result(const ExpressionPtr& expr, const LinearIR& linear_ir) { void validate_buffer(const ExpressionPtr& expr, const LinearIR& linear_ir) { OPENVINO_ASSERT(ov::is_type(expr->get_node()), "Buffer validation expects Buffer op"); + OPENVINO_ASSERT(ov::is_type(expr), + "Buffer validation expects Buffer expression"); for (const auto& input : expr->get_input_port_connectors()) { const auto& source = input->get_source(); const auto ma = std::dynamic_pointer_cast(source.get_expr()->get_node()); - OPENVINO_ASSERT(ma && ma->is_memory_access_input_port(source.get_index()), + OPENVINO_ASSERT(ma && ma->is_memory_access_output_port(source.get_index()), "Buffer expects MemoryAccess parent"); const auto buffer_siblings = input->get_consumers(); for (const auto& buffer_sibling : buffer_siblings) { @@ -124,39 +126,6 @@ void validate_loop_end(const ExpressionPtr& expr, const LinearIR& linear_ir) { validate_loop_ports(input_port_infos); validate_loop_ports(output_port_infos, loop_end->get_input_num()); } - -// TODO [143395] : Extract this validation checks to the separate `ValidateBuffers` pass -void validate_buffer_expressions(const LinearIR::container& buffer_expressions) { - std::set cluster_ids; - std::map> dynamic_buffer_clusters, static_buffer_clusters; - - for (const auto& buffer_expr : buffer_expressions) { - const auto buffer = ov::as_type_ptr(buffer_expr->get_node()); - OPENVINO_ASSERT(buffer, "Expected Buffer ops in Buffer expressions of LinearIR"); - - // TODO [143395] : MemoryManager should provide exact containers with needed buffers (static or dynamic) without any `is_defined()` - auto& clusters = buffer->is_defined() ? 
static_buffer_clusters : dynamic_buffer_clusters; - clusters[buffer->get_cluster_id()].insert(buffer_expr); - cluster_ids.insert(buffer->get_cluster_id()); - } - - OPENVINO_ASSERT(cluster_ids.size() == dynamic_buffer_clusters.size() + static_buffer_clusters.size(), "Incorrect count of Buffer clusters"); - OPENVINO_ASSERT(cluster_ids.empty() || (*cluster_ids.cbegin() == 0 && *cluster_ids.crbegin() == (cluster_ids.size() - 1)), - "Incorrect indetifiers of Buffer clusters"); - - for (const auto& p : static_buffer_clusters) { - const auto& cluster_id = p.first; - const auto& cluster = p.second; - OPENVINO_ASSERT(dynamic_buffer_clusters.count(cluster_id) == 0, "Buffers from the same cluster must be only static or dynamic"); - - OPENVINO_ASSERT(cluster.size() > 0, "Incorrect size of buffer cluster"); - size_t cluster_offset = ov::as_type_ptr((*cluster.cbegin())->get_node())->get_offset(); - for (const auto& buffer_expr : cluster) { - OPENVINO_ASSERT(cluster_offset == ov::as_type_ptr(buffer_expr->get_node())->get_offset(), - "Static Buffers from the same cluster must have the same offset!"); - } - } -} } // namespace Validate::Validate() { @@ -188,8 +157,6 @@ bool Validate::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lo prev_exec_order = expr->get_exec_num(); } - validate_buffer_expressions(linear_ir.get_buffers()); - return false; } diff --git a/src/common/snippets/src/lowered/pass/validate_buffers.cpp b/src/common/snippets/src/lowered/pass/validate_buffers.cpp new file mode 100644 index 00000000000000..c5100f42333ede --- /dev/null +++ b/src/common/snippets/src/lowered/pass/validate_buffers.cpp @@ -0,0 +1,68 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/lowered/pass/validate_buffers.hpp" + +#include "snippets/utils/utils.hpp" +#include "snippets/itt.hpp" + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { + +bool ValidateBuffers::run(LinearIR& linear_ir, 
lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { + OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::ValidateBuffers") + + const auto& lir_buffers = linear_ir.get_buffers(); + + // Firstly we check that all BufferExpression are in "get_buffers()" + for (const auto& expr : linear_ir) { + if (const auto& buffer_expr = ov::as_type_ptr(expr)) + OPENVINO_ASSERT(std::find(lir_buffers.cbegin(), lir_buffers.cend(), buffer_expr) != lir_buffers.cend(), + "All BufferExpressions must be in LinearIR.get_buffers()"); + } + + // Secondly we should validate `lir_buffers`: + // - execution order + // - clusters + + std::set cluster_ids; + std::map> dynamic_buffer_clusters, static_buffer_clusters; + + double prev_exec_order = -1 * std::numeric_limits::max(); + for (const auto& buffer_expr : lir_buffers) { + // TODO [143395] : MemoryManager should provide exact containers with needed buffers (static or dynamic) without any `is_defined()` + auto& clusters = buffer_expr->is_defined() ? 
static_buffer_clusters : dynamic_buffer_clusters; + clusters[buffer_expr->get_cluster_id()].insert(buffer_expr); + cluster_ids.insert(buffer_expr->get_cluster_id()); + + OPENVINO_ASSERT(buffer_expr->get_exec_num() > prev_exec_order, "Invalid execution order of buffer expressions"); + prev_exec_order = buffer_expr->get_exec_num(); + buffer_expr->validate(); + } + + OPENVINO_ASSERT(cluster_ids.size() == dynamic_buffer_clusters.size() + static_buffer_clusters.size(), "Incorrect count of Buffer clusters"); + OPENVINO_ASSERT(cluster_ids.empty() || (*cluster_ids.cbegin() == 0 && *cluster_ids.crbegin() == (cluster_ids.size() - 1)), + "Incorrect identifiers of Buffer clusters"); + + for (const auto& p : static_buffer_clusters) { + const auto& cluster_id = p.first; + const auto& cluster = p.second; + OPENVINO_ASSERT(dynamic_buffer_clusters.count(cluster_id) == 0, "Buffers from the same cluster must be only static or dynamic"); + + OPENVINO_ASSERT(cluster.size() > 0, "Incorrect size of buffer cluster"); + size_t cluster_offset = (*cluster.cbegin())->get_offset(); + for (const auto& buffer_expr : cluster) { + OPENVINO_ASSERT(cluster_offset == buffer_expr->get_offset(), "Static Buffers from the same cluster must have the same offset!"); + } + } + + return !lir_buffers.empty(); +} + +} // namespace pass +} // namespace lowered +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/src/op/buffer.cpp b/src/common/snippets/src/op/buffer.cpp index 0c7403cd56f6f5..0c13c12ee8c32d 100644 --- a/src/common/snippets/src/op/buffer.cpp +++ b/src/common/snippets/src/op/buffer.cpp @@ -13,89 +13,83 @@ namespace ov { namespace snippets { namespace op { -Buffer::Buffer(const OutputVector& arguments, size_t allocation_size, size_t reg_group, size_t cluster_id) - : Op(arguments), m_allocation_size(allocation_size), m_reg_group(reg_group), m_cluster_id(cluster_id), m_offset(0) { - constructor_validate_and_infer_types(); -} - -bool Buffer::visit_attributes(AttributeVisitor&
visitor) { - INTERNAL_OP_SCOPE(Buffer_visit_attributes); - auto element_type = get_element_type(); - auto allocation_size = utils::value2str(m_allocation_size); - auto offset = utils::value2str(m_offset); - visitor.on_attribute("allocation_size", allocation_size); - visitor.on_attribute("offset", offset); - visitor.on_attribute("reg_group", m_reg_group); - visitor.on_attribute("cluster_id", m_cluster_id); - visitor.on_attribute("element_type", element_type); - return true; -} - -bool Buffer::is_defined() const { - return !utils::is_dynamic_value(m_allocation_size); -} - -size_t Buffer::get_byte_size() const { - if (is_defined()) - return m_allocation_size * get_element_type().size(); - return utils::get_dynamic_value(); -} +Buffer::Buffer(const ov::Output& arg) : Buffer(ov::OutputVector{arg}) {} -IntermediateMemoryBuffer::IntermediateMemoryBuffer(const OutputVector& arguments, size_t allocation_size, size_t reg_group, size_t cluster_id) - : Buffer(arguments, allocation_size, reg_group, cluster_id) { +Buffer::Buffer(const OutputVector& arguments) : Op(arguments), m_type(Type::IntermediateMemory) { constructor_validate_and_infer_types(); } -IntermediateMemoryBuffer::IntermediateMemoryBuffer(const ov::Output& arg, size_t allocation_size, size_t reg_group, size_t cluster_id) - : IntermediateMemoryBuffer(OutputVector{arg}, allocation_size, reg_group, cluster_id) {} - -void IntermediateMemoryBuffer::validate_and_infer_types() { - INTERNAL_OP_SCOPE(Buffer_validate_and_infer_types); - ov::PartialShape output_shape; - set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); -} - -std::shared_ptr IntermediateMemoryBuffer::clone_with_new_inputs(const OutputVector& new_args) const { - INTERNAL_OP_SCOPE(Buffer_clone_with_new_inputs); - auto new_buffer = std::make_shared(new_args, m_allocation_size, m_reg_group, m_cluster_id); - new_buffer->set_offset(m_offset); - return new_buffer; +Buffer::Buffer(const ov::Shape& shape, ov::element::Type element_type) : Op(), 
m_type(Type::NewMemory), m_output_shape(shape), m_element_type(element_type) { + constructor_validate_and_infer_types(); } -NewMemoryBuffer::NewMemoryBuffer(const ov::Shape& shape, size_t reg_group, size_t cluster_id, ov::element::Type element_type) - : Buffer({}, ov::shape_size(shape), reg_group, cluster_id), m_output_shape(shape), m_element_type(element_type) { - constructor_validate_and_infer_types(); +bool Buffer::visit_attributes(AttributeVisitor& visitor) { + INTERNAL_OP_SCOPE(Buffer_visit_attributes); + auto shape = utils::pshape_to_vdims(get_output_partial_shape(0)); + auto etype = get_output_element_type(0); + visitor.on_attribute("shape", shape); + visitor.on_attribute("element_type", etype); + return true; } -void NewMemoryBuffer::validate_and_infer_types() { +void Buffer::validate_and_infer_types() { INTERNAL_OP_SCOPE(Buffer_validate_and_infer_types); - OPENVINO_ASSERT(get_input_size() == 0, "Buffer with new allocated memory mustn't have arguments!"); - set_output_type(0, m_element_type, m_output_shape); + if (m_type == Type::NewMemory) { + OPENVINO_ASSERT(get_input_size() == 0, "NewMemory Buffer mustn't have inputs"); + set_output_type(0, m_element_type, m_output_shape); + } else if (m_type == Type::IntermediateMemory) { + OPENVINO_ASSERT(get_input_size() != 0, "IntermediateMemory Buffer must have inputs"); + const auto inputs = input_values(); + const auto inshape = get_input_partial_shape(0); + const auto intype = get_input_element_type(0); + OPENVINO_ASSERT(std::all_of(inputs.cbegin() + 1, inputs.cend(), + [&](const ov::Output& in) { return in.get_partial_shape() == inshape && in.get_element_type() == intype; }), + "All inputs of Buffers must have the same shape and element type"); + set_output_type(0, intype, inshape); + } else { + OPENVINO_THROW("Unknown Buffer type"); + } } -std::shared_ptr NewMemoryBuffer::clone_with_new_inputs(const OutputVector& new_args) const { +std::shared_ptr Buffer::clone_with_new_inputs(const OutputVector& new_args) 
const { INTERNAL_OP_SCOPE(Buffer_clone_with_new_inputs); - check_new_args_count(this, new_args); - auto new_buffer = std::make_shared(m_output_shape, m_reg_group, m_cluster_id, m_element_type); - new_buffer->set_offset(m_offset); - return new_buffer; + if (m_type == Type::NewMemory) { + OPENVINO_ASSERT(new_args.empty(), "NewMemory Buffer mustn't have inputs"); + return std::make_shared(m_output_shape, m_element_type); + } else if (m_type == Type::IntermediateMemory) { + return std::make_shared(new_args); + } else { + OPENVINO_THROW("Unknown Buffer type"); + } } -void NewMemoryBuffer::set_element_type(ov::element::Type element_type) { - m_element_type = std::move(element_type); - // Apply the change - validate_and_infer_types(); +size_t Buffer::get_allocation_size() const { + if (m_type == Type::NewMemory) { + const auto pshape = get_output_partial_shape(0); + OPENVINO_ASSERT(pshape.is_static(), "If Buffer doesn't have source - output shape must be static"); + return ov::shape_size(pshape.get_shape()); + } + return utils::get_dynamic_value(); } -NewMemoryBuffer::ShapeInfer::ShapeInfer(const std::shared_ptr& n) { - const auto& buffer = ov::as_type_ptr(n); - OPENVINO_ASSERT(buffer, "Got invalid node in NewMemoryBuffer::ShapeInfer"); - m_shape = buffer->get_shape(); +Buffer::ShapeInfer::ShapeInfer(const std::shared_ptr& n) { + const auto& buffer = ov::as_type_ptr(n); + OPENVINO_ASSERT(buffer, "Got invalid node in Buffer::ShapeInfer"); + m_type = buffer->m_type; + OPENVINO_ASSERT(utils::one_of(m_type, Type::IntermediateMemory, Type::NewMemory), "Got invalid Buffer type"); + if (m_type == Type::NewMemory) + m_shape = buffer->m_output_shape; } -IShapeInferSnippets::Result NewMemoryBuffer::ShapeInfer::infer(const std::vector& input_shapes) { - OPENVINO_ASSERT(input_shapes.empty(), "NewMemoryBuffer shape inference mustn't have input shapes"); - return {{m_shape}, ShapeInferStatus::success}; +IShapeInferSnippets::Result Buffer::ShapeInfer::infer(const std::vector& 
input_shapes) { + if (m_type == Type::NewMemory) { + OPENVINO_ASSERT(input_shapes.empty(), "NewMemoryBuffer shape inference mustn't have input shapes"); + return {{m_shape}, ShapeInferStatus::success}; + } else if (m_type == Type::IntermediateMemory) { + OPENVINO_ASSERT(!input_shapes.empty(), "IntermediateMemoryBuffer shape inference must have input shapes"); + return {{input_shapes[0].get()}, ShapeInferStatus::success}; + } + OPENVINO_THROW("Unknown Buffer type!"); } } // namespace op diff --git a/src/common/snippets/src/op/serialization_node.cpp b/src/common/snippets/src/op/serialization_node.cpp index c136acea975a42..1718f770ad62d6 100644 --- a/src/common/snippets/src/op/serialization_node.cpp +++ b/src/common/snippets/src/op/serialization_node.cpp @@ -40,83 +40,7 @@ std::shared_ptr SerializationNode::clone_with_new_inputs(const OutputVecto } bool SerializationNode::visit_attributes(AttributeVisitor &visitor) { - auto is_planar_layout = [](const std::vector& layout) { - for (size_t i = 0; i < layout.size(); ++i) - if (layout[i] != i) return false; - return true; - }; - auto subtensor2str = [](const VectorDims& subtensor) { - std::stringstream ss; - for (size_t i = 0; i < subtensor.size(); ++i) { - const auto& v = subtensor[i]; - const auto v_str = utils::is_full_dim_value(v) ? "FULL_DIM" : - utils::is_dynamic_value(v) ? "?" : std::to_string(v); - const auto del = i < subtensor.size() - 1 ?
", " : ""; - ss << v_str << del; - } - return ss.str(); - }; - - std::vector in_regs, out_regs; - std::vector in_reg_types, out_reg_types; - std::vector> shapes; - std::vector> subtensors; - std::vector>> layouts; - for (size_t i = 0; i < m_expr->get_input_count(); i++) { - const auto& desc = m_expr->get_input_port_descriptor(i); - const auto& shape = desc->get_shape(); - if (!shape.empty()) - shapes.emplace_back("in_shape_" + std::to_string(i), ov::PartialShape(shape)); - - const auto& subtensor = desc->get_subtensor(); - if (!subtensor.empty()) - subtensors.emplace_back("in_subtensor_" + std::to_string(i), subtensor2str(subtensor)); - - const auto& layout = desc->get_layout(); - if (!layout.empty() && !is_planar_layout(layout)) - layouts.emplace_back("in_layout_" + std::to_string(i), layout); - - in_reg_types.emplace_back(regTypeToStr(desc->get_reg().type)); - in_regs.emplace_back(desc->get_reg().idx); - } - for (size_t i = 0; i < m_expr->get_output_count(); i++) { - const auto& desc = m_expr->get_output_port_descriptor(i); - const auto& shape = desc->get_shape(); - if (!shape.empty()) - shapes.emplace_back("out_shape_" + std::to_string(i), ov::PartialShape(shape)); - - const auto& subtensor = desc->get_subtensor(); - if (!subtensor.empty()) - subtensors.emplace_back("out_subtensor_" + std::to_string(i), subtensor2str(subtensor)); - - const auto& layout = desc->get_layout(); - if (!layout.empty() && !is_planar_layout(layout)) - layouts.emplace_back("out_layout_" + std::to_string(i), layout); - - out_reg_types.emplace_back(regTypeToStr(desc->get_reg().type)); - out_regs.emplace_back(desc->get_reg().idx); - } - - if (!in_regs.empty()) { - visitor.on_attribute("in_regs", in_regs); - visitor.on_attribute("in_reg_types", in_reg_types); - } - if (!out_regs.empty()) { - visitor.on_attribute("out_regs", out_regs); - visitor.on_attribute("out_reg_types", out_reg_types); - } - for (auto& s : shapes) - visitor.on_attribute(s.first, s.second); - for (auto& s : subtensors) - 
visitor.on_attribute(s.first, s.second); - for (auto& s : layouts) - visitor.on_attribute(s.first, s.second); - auto loop_ids = m_expr->get_loop_ids(); - visitor.on_attribute("loop_ids", loop_ids); - auto exec_num = m_expr->get_exec_num(); - visitor.on_attribute("execution_number", exec_num); - m_expr->get_node()->visit_attributes(visitor); - return true; + return m_expr->visit_attributes(visitor); } } // namespace op diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp index cf9f6b3121782e..0690494220171a 100644 --- a/src/common/snippets/src/op/subgraph.cpp +++ b/src/common/snippets/src/op/subgraph.cpp @@ -44,6 +44,7 @@ #include "snippets/lowered/pass/optimize_domain.hpp" #include "snippets/lowered/pass/insert_perf_count.hpp" #include "snippets/lowered/pass/validate_shapes.hpp" +#include "snippets/lowered/pass/validate_buffers.hpp" #include "snippets/lowered/pass/validate.hpp" #include "snippets/lowered/pass/pass_config.hpp" #include "snippets/lowered/pass/reduce_decomposition.hpp" @@ -472,6 +473,7 @@ void Subgraph::control_flow_transformations(size_t min_parallel_work_amount, siz pipeline.register_pass(m_linear_ir->get_config().m_are_buffers_optimized); pipeline.register_pass(); pipeline.register_positioned_passes(lowered_backend_passes); + pipeline.register_pass(); // must be last pipeline.register_pass(); // must be last pipeline.run(*m_linear_ir); diff --git a/src/common/snippets/src/runtime_configurator.cpp b/src/common/snippets/src/runtime_configurator.cpp index 552455b89f5529..a2b288eabde14e 100644 --- a/src/common/snippets/src/runtime_configurator.cpp +++ b/src/common/snippets/src/runtime_configurator.cpp @@ -154,17 +154,14 @@ void RuntimeConfigurator::init_data_info(const lowered::LinearIRCPtr& linear_ir) } void RuntimeConfigurator::init_buffer_info(const lowered::LinearIRCPtr& linear_ir) { - std::map> dynamic_buffer_clusters, static_buffer_clusters; + std::map> dynamic_buffer_clusters, static_buffer_clusters; // 
All needed checks are in Validate pass const auto& buffer_expressions = linear_ir->get_buffers(); for (const auto& buffer_expr : buffer_expressions) { - const auto buffer = ov::as_type_ptr(buffer_expr->get_node()); - OPENVINO_ASSERT(buffer, "Expected Buffer ops in Buffer expressions of LinearIR"); - // TODO [143395] : MemoryManager should provide exact containers with needed buffers (static or dynamic) without any `is_defined()` - auto& clusters = buffer->is_defined() ? static_buffer_clusters : dynamic_buffer_clusters; - clusters[buffer->get_cluster_id()].insert(buffer_expr); + auto& clusters = buffer_expr->is_defined() ? static_buffer_clusters : dynamic_buffer_clusters; + clusters[buffer_expr->get_cluster_id()].insert(buffer_expr); } const auto cluster_count = dynamic_buffer_clusters.size() + static_buffer_clusters.size(); @@ -176,7 +173,7 @@ void RuntimeConfigurator::init_buffer_info(const lowered::LinearIRCPtr& linear_i const auto& cluster = p.second; OPENVINO_ASSERT(cluster.size() > 0, "Incorrect size of buffer cluster"); - size_t cluster_offset = ov::as_type_ptr((*cluster.cbegin())->get_node())->get_offset(); + size_t cluster_offset = (*cluster.cbegin())->get_offset(); m_config->buffer_cluster_offsets[cluster_id] = cluster_offset; } @@ -269,7 +266,8 @@ void RuntimeConfigurator::update_buffer_scratchpad_size(const lowered::LinearIRC // No need to calculate allocation size of Buffers which are in Loops with `work_amount = 0` - they won't be executed if (is_not_executed(buffer_expr)) continue; - const auto& allocation_size = lowered::pass::ComputeBufferAllocationSize::get_allocation_size(loop_manager, buffer_expr, m_config->tile_rank); + buffer_expr->init_allocation_size(loop_manager, m_config->tile_rank); + const auto& allocation_size = buffer_expr->get_allocation_size(); OPENVINO_ASSERT(!utils::is_dynamic_value(allocation_size), "Buffer scratchpad size must be defined!"); additional_size = std::max(allocation_size * 
buffer_expr->get_node()->get_element_type().size(), additional_size); } diff --git a/src/common/snippets/src/shape_inference/shape_inference.cpp b/src/common/snippets/src/shape_inference/shape_inference.cpp index ff42dae602a54f..76a4c491c66983 100644 --- a/src/common/snippets/src/shape_inference/shape_inference.cpp +++ b/src/common/snippets/src/shape_inference/shape_inference.cpp @@ -39,7 +39,6 @@ const IShapeInferSnippetsFactory::TRegistry IShapeInferSnippetsFactory::registry SHAPE_INFER_PREDEFINED(op::ConvertSaturation, PassThroughShapeInfer), SHAPE_INFER_PREDEFINED(op::Load, PassThroughShapeInfer), SHAPE_INFER_PREDEFINED(op::Store, PassThroughShapeInfer), - SHAPE_INFER_PREDEFINED(op::IntermediateMemoryBuffer, PassThroughShapeInfer), SHAPE_INFER_PREDEFINED(op::Fill, PassThroughShapeInfer), SHAPE_INFER_PREDEFINED(ov::op::v0::Parameter, PassThroughShapeInfer), SHAPE_INFER_PREDEFINED(ov::op::v1::LogicalNot, PassThroughShapeInfer), @@ -70,7 +69,7 @@ const IShapeInferSnippetsFactory::TRegistry IShapeInferSnippetsFactory::registry SHAPE_INFER_OP_SPECIFIC(op::RankNormalization), SHAPE_INFER_OP_SPECIFIC(op::BroadcastLoad), SHAPE_INFER_OP_SPECIFIC(op::BroadcastMove), - SHAPE_INFER_OP_SPECIFIC(op::NewMemoryBuffer), + SHAPE_INFER_OP_SPECIFIC(op::Buffer), }; #undef SHAPE_INFER_OP_SPECIFIC_EXTERNAL #undef SHAPE_INFER_OP_SPECIFIC diff --git a/src/common/snippets/tests/src/lowered/pass/buffer_allocation.cpp b/src/common/snippets/tests/src/lowered/pass/buffer_allocation.cpp index 4dc3f2dae7e867..ac521631917897 100644 --- a/src/common/snippets/tests/src/lowered/pass/buffer_allocation.cpp +++ b/src/common/snippets/tests/src/lowered/pass/buffer_allocation.cpp @@ -82,11 +82,9 @@ void BufferAllocationTest::ApplyTransformations(const std::shared_ptr reg_groups, clusters; - for (const auto& expr : m_linear_ir) { - if (const auto buffer = ov::as_type_ptr(expr->get_node())) { - reg_groups.insert(buffer->get_reg_group()); - clusters.insert(buffer->get_cluster_id()); - } + for (const auto& 
buffer_expr : m_linear_ir.get_buffers()) { + reg_groups.insert(buffer_expr->get_reg_group()); + clusters.insert(buffer_expr->get_cluster_id()); } EXPECT_EQ(reg_groups.size(), m_expected_reg_group_count); EXPECT_EQ(clusters.size(), m_expected_cluster_count); @@ -100,9 +98,9 @@ std::shared_ptr EltwiseBufferAllocationTest::GetModel() const { const auto parameter0 = std::make_shared(ov::element::f32, ov::PartialShape({1, 3, 100, 100})); const auto parameter1 = std::make_shared(ov::element::f32, ov::PartialShape({1, 3, 100, 100})); const auto add = std::make_shared(parameter0, parameter1); - const auto buffer0 = std::make_shared(add); + const auto buffer0 = std::make_shared(add); const auto relu = std::make_shared(buffer0); - const auto buffer1 = std::make_shared(relu); + const auto buffer1 = std::make_shared(relu); const auto exp = std::make_shared(buffer1); const auto body = std::make_shared(std::make_shared(exp), ov::ParameterVector{parameter0, parameter1}); diff --git a/src/common/snippets/tests/src/lowering_utils.cpp b/src/common/snippets/tests/src/lowering_utils.cpp index 136dccb5fac667..e9ed04bf8da5a4 100644 --- a/src/common/snippets/tests/src/lowering_utils.cpp +++ b/src/common/snippets/tests/src/lowering_utils.cpp @@ -51,8 +51,7 @@ DummyTargetMachine::DummyTargetMachine(const std::vector& jitters[ov::snippets::op::PerfCountEnd::get_type_info_static()] = dummy_functor; #endif jitters[ov::snippets::op::Brgemm::get_type_info_static()] = dummy_functor; - jitters[ov::snippets::op::IntermediateMemoryBuffer::get_type_info_static()] = dummy_functor; - jitters[ov::snippets::op::NewMemoryBuffer::get_type_info_static()] = dummy_functor; + jitters[ov::snippets::op::Buffer::get_type_info_static()] = dummy_functor; jitters[ov::snippets::op::VectorBuffer::get_type_info_static()] = dummy_functor; jitters[ov::snippets::op::Fill::get_type_info_static()] = dummy_functor; jitters[ov::snippets::op::ReduceMax::get_type_info_static()] = dummy_functor; diff --git 
a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_kernel_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_kernel_emitter.cpp index 9345b79c37e710..806253f0cc2155 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_kernel_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_kernel_emitter.cpp @@ -38,6 +38,7 @@ jit_kernel_emitter::jit_kernel_emitter(jit_generator* h, cpu_isa_t isa, const ov jcp = *reinterpret_cast(kernel->compile_params); const auto& parameters = body->get_parameters(); const auto& results = body->get_results(); + const auto& buffers = body->get_buffers(); num_inputs = parameters.size(); num_outputs = results.size(); for (const auto& param : parameters) @@ -46,19 +47,20 @@ jit_kernel_emitter::jit_kernel_emitter(jit_generator* h, cpu_isa_t isa, const ov mem_access_exprs.push_back(result); std::set unique_buffers; - for (const auto& expr : *body) { - if (const auto buffer = ov::as_type_ptr(expr->get_node())) { - const auto buffer_id = buffer->get_cluster_id(); - if (unique_buffers.count(buffer_id) == 0) { - mem_access_exprs.push_back(expr); - unique_buffers.insert(buffer_id); - } - } else { - if (std::find(parameters.cbegin(), parameters.cend(), expr) == parameters.cend() && - std::find(results.cbegin(), results.cend(), expr) == results.cend()) - general_exprs.emplace_back(expr); + for (const auto& buffer_expr : buffers) { + const auto buffer_reg_group = buffer_expr->get_reg_group(); + if (unique_buffers.count(buffer_reg_group) == 0) { + mem_access_exprs.push_back(buffer_expr); + unique_buffers.insert(buffer_reg_group); } } + + for (const auto& expr : *body) { + if (std::find(parameters.cbegin(), parameters.cend(), expr) == parameters.cend() && + std::find(results.cbegin(), results.cend(), expr) == results.cend() && + std::find(buffers.cbegin(), buffers.cend(), expr) == buffers.cend()) + general_exprs.emplace_back(expr); + } num_unique_buffers = unique_buffers.size(); } diff --git 
a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp index 01a87d849f9731..1da6cd7121487f 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp @@ -159,8 +159,7 @@ intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t ho // data movement jitters[op::v0::Parameter::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[op::v0::Result::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); - jitters[snippets::op::IntermediateMemoryBuffer::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); - jitters[snippets::op::NewMemoryBuffer::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); + jitters[snippets::op::Buffer::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::VectorBuffer::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::RankNormalization::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::Reshape::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.cpp index ff38c5586af106..4c36aa3b21ab35 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.cpp @@ -41,7 +41,7 @@ jit_brgemm_emitter::jit_brgemm_emitter(jit_generator* h, cpu_isa_t isa, "Jit emitter is called when the shapes are unknown"); auto get_cluster_id = [](const snippets::lowered::ExpressionPort& p) { // Note: NewMemoryBuffer is used as a scratchpad and can't be dynamic, so 
we don't need to account for them here - if (const auto buffer = ov::as_type_ptr(p.get_expr()->get_node())) + if (const auto buffer = ov::as_type_ptr(p.get_expr())) return buffer->get_cluster_id(); else return SIZE_MAX; diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_kernel_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_kernel_emitter.cpp index ff58ef8b0a5bcb..a86bf841c241da 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_kernel_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_kernel_emitter.cpp @@ -23,6 +23,7 @@ jit_kernel_emitter::jit_kernel_emitter(jit_generator* h, cpu_isa_t isa, const ov jcp = *reinterpret_cast(kernel->compile_params); const auto& parameters = body->get_parameters(); const auto& results = body->get_results(); + const auto& buffers = body->get_buffers(); num_inputs = parameters.size(); num_outputs = results.size(); for (const auto& param : parameters) @@ -31,19 +32,20 @@ jit_kernel_emitter::jit_kernel_emitter(jit_generator* h, cpu_isa_t isa, const ov mem_access_exprs.push_back(result); std::set unique_buffers; - for (const auto& expr : *body) { - if (const auto buffer = ov::as_type_ptr(expr->get_node())) { - const auto buffer_reg_group = buffer->get_reg_group(); - if (unique_buffers.count(buffer_reg_group) == 0) { - mem_access_exprs.push_back(expr); - unique_buffers.insert(buffer_reg_group); - } - } else { - if (std::find(parameters.cbegin(), parameters.cend(), expr) == parameters.cend() && - std::find(results.cbegin(), results.cend(), expr) == results.cend()) - general_exprs.emplace_back(expr); + for (const auto& buffer_expr : buffers) { + const auto buffer_reg_group = buffer_expr->get_reg_group(); + if (unique_buffers.count(buffer_reg_group) == 0) { + mem_access_exprs.push_back(buffer_expr); + unique_buffers.insert(buffer_reg_group); } } + + for (const auto& expr : *body) { + if (std::find(parameters.cbegin(), parameters.cend(), expr) == parameters.cend() && + 
std::find(results.cbegin(), results.cend(), expr) == results.cend() && + std::find(buffers.cbegin(), buffers.cend(), expr) == buffers.cend()) + general_exprs.emplace_back(expr); + } num_unique_buffers = unique_buffers.size(); } diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.cpp index 1d8c26e3d709fa..f2fd978edc6aaf 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.cpp @@ -60,7 +60,7 @@ size_t jit_memory_emitter::aux_gprs_count() const { size_t jit_memory_emitter::get_parent_buffer_cluster_id(const ov::snippets::lowered::ExpressionPtr& expr) { OV_CPU_JIT_EMITTER_ASSERT(expr->get_input_port_connectors().size() == 1, "MemoryAccess must have one parent"); const auto& parent_expr = expr->get_input_port_connector(0)->get_source().get_expr(); - if (const auto buffer = ov::as_type_ptr(parent_expr->get_node())) { + if (const auto buffer = ov::as_type_ptr(parent_expr)) { return buffer->get_cluster_id(); } return SIZE_MAX; @@ -70,7 +70,7 @@ size_t jit_memory_emitter::get_consumer_buffer_cluster_id(const ov::snippets::lo OV_CPU_JIT_EMITTER_ASSERT(expr->get_output_port_connectors().size() == 1, "MemoryAccess must have one consumer"); const auto& consumers = expr->get_output_port_connector(0)->get_consumers(); for (const auto& consumer : consumers) - if (const auto buffer = ov::as_type_ptr(consumer.get_expr()->get_node())) + if (const auto buffer = ov::as_type_ptr(consumer.get_expr())) return buffer->get_cluster_id(); return SIZE_MAX; } diff --git a/src/plugins/intel_cpu/src/extension.cpp b/src/plugins/intel_cpu/src/extension.cpp index d5a8801ffedeac..5e43da6e2bfb86 100644 --- a/src/plugins/intel_cpu/src/extension.cpp +++ b/src/plugins/intel_cpu/src/extension.cpp @@ -161,12 +161,11 @@ class TypeRelaxedExtension : public ov::OpExtension> { 
OP_EXTENSION(ov::snippets::op::HorizonSum) \ OP_EXTENSION(ov::snippets::op::KernelStatic) \ OP_EXTENSION(ov::snippets::op::KernelDynamic) \ - OP_EXTENSION(ov::snippets::op::IntermediateMemoryBuffer) \ OP_EXTENSION(ov::snippets::op::Load) \ OP_EXTENSION(ov::snippets::op::LoadReshape) \ OP_EXTENSION(ov::snippets::op::LoopBegin) \ OP_EXTENSION(ov::snippets::op::LoopEnd) \ - OP_EXTENSION(ov::snippets::op::NewMemoryBuffer) \ + OP_EXTENSION(ov::snippets::op::Buffer) \ OP_EXTENSION(ov::snippets::op::Nop) \ OP_EXTENSION(ov::snippets::op::PowerStatic) \ OP_EXTENSION(ov::snippets::op::Scalar) \ diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index e166fc8bf453e7..8d04c41676b193 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -21,6 +21,7 @@ #include "snippets/lowered/pass/optimize_domain.hpp" #include "snippets/lowered/pass/insert_loops.hpp" #include "snippets/lowered/pass/mark_loops.hpp" +#include "snippets/lowered/pass/insert_buffers.hpp" #include "transformations/defs.hpp" #include "transformations/cpu_opset/common/pass/convert_to_swish_cpu.hpp" #include "transformations/snippets/common/pass/mul_add_to_fma.hpp" @@ -32,7 +33,7 @@ #else #include "emitters/snippets/x64/cpu_generator.hpp" #include "transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.hpp" -#include "transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.hpp" +#include "transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.hpp" #include "transformations/snippets/x64/pass/remove_converts.hpp" #include "transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.hpp" #include "transformations/snippets/x64/pass/enforce_precision.hpp" @@ -682,8 +683,8 @@ Subgraph::ControlFlowPasses Subgraph::getControlFlowPasses() const { ov::intel_cpu::pass::FuseLoadStoreConvert); #if defined(OPENVINO_ARCH_X86_64) - SNIPPETS_REGISTER_PASS_RELATIVE(Place::After, 
ov::intel_cpu::pass::FuseLoadStoreConvert, - ov::intel_cpu::pass::SetBrgemmCopyBBuffersShape); + SNIPPETS_REGISTER_PASS_RELATIVE(Place::Before, ov::snippets::lowered::pass::InsertBuffers, + ov::intel_cpu::pass::InsertBrgemmCopyBBuffers); #endif #ifdef SNIPPETS_LIBXSMM_TPP diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.cpp index cc30edef38086f..dfe4441de90699 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.cpp @@ -137,7 +137,7 @@ std::shared_ptr BrgemmCPU::get_brgemm_copy() const { if (const auto brgemm_copy_b = ov::as_type_ptr(b_input_node)) { return brgemm_copy_b; } - if (ov::is_type(b_input_node)) { + if (ov::is_type(b_input_node)) { if (const auto brgemm_copy_b = ov::as_type_ptr(b_input_node->get_input_node_shared_ptr(0))) { return brgemm_copy_b; } diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp index e5fac40ac09604..af70218ce0635f 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp @@ -77,46 +77,6 @@ size_t get_elems_in_vec(const ov::element::Type& precision) { } namespace repacking { -size_t get_repacking_buffer_size(const ov::snippets::lowered::ExpressionPtr& copy_b_expr) { - OPENVINO_ASSERT(ov::is_type(copy_b_expr->get_node())); - const auto& in_desc = copy_b_expr->get_input_port_descriptor(0); - const auto& in_layout = in_desc->get_layout(); - const auto& in_subtensor = ov::snippets::utils::get_projected_subtensor(copy_b_expr->get_input_port(0)); - - const size_t n_blk = *in_subtensor.rbegin(); - const size_t k_blk = *++in_subtensor.rbegin(); - OPENVINO_ASSERT(!is_dynamic_value(n_blk) && 
!is_dynamic_value(k_blk), "get_repacking_buffer_size must be called with static subtensor values"); - - const auto& precision = copy_b_expr->get_node()->get_input_element_type(0); - // Repacking buffer shape is set in accordance to OneDNN requirements - const size_t N_dim = std::max(n_blk, compute_inner_n_block(precision)); - if (!in_layout.empty() && in_layout.back() != in_layout.size() - 1) { - // In case of transpose, K dimension must be rounded-up to number of elems in vector register - // For the details, please see 'transpose16x8' and 'fixup16x16' implementations and usage in onednn/src/cpu/x64/matmul/brgemm_matmul_copy_utils.cpp - const auto elems_in_vec = brgemm_utils::get_elems_in_vec(precision); - return N_dim * rnd_up(k_blk, elems_in_vec); - } else { - // Low precision repacking writes the result by m_brgemmVNNIFactor * m_inner_n_block blocks - // despite the actual size of the input data. Because of that we have to round-up the allocation shape to always have enough memory allocated. 
- // For the details, please see 'copy_4x64' and 'copy_2x32' implementations and usage in onednn/src/cpu/x64/matmul/brgemm_matmul_copy_utils.cpp - const auto brgemmVNNIFactor = brgemm_utils::compute_vnni_factor(precision); - OPENVINO_ASSERT(brgemmVNNIFactor > 0, "brgemmVNNIFactor value must be positive."); - return N_dim * rnd_up(k_blk, brgemmVNNIFactor); - } -} - -size_t get_compensations_buffer_size(const ov::snippets::lowered::ExpressionPtr& copy_b_expr) { - OPENVINO_ASSERT(ov::is_type(copy_b_expr->get_node())); - const auto& in_subtensor = ov::snippets::utils::get_projected_subtensor(copy_b_expr->get_input_port(0)); - const size_t n_blk = *in_subtensor.rbegin(); - OPENVINO_ASSERT(!is_dynamic_value(n_blk), "get_compensations_buffer_size must be called with static subtensor values"); - const auto& precision = copy_b_expr->get_node()->get_input_element_type(0); - // Compensations are computed during repacking, so we need to round-up allocation shape according to m_inner_n_block - // because of OneDNN implementation nuances (as in get_repacking_buffer_size). 
- // However, the compensations are computed by N dimension, so K dimension doesn't affect the compensations buffer - return std::max(n_blk, compute_inner_n_block(precision)); -} - size_t compute_out_leading_dim(const size_t n_block, const ov::element::Type& precision) { return std::max(n_block, compute_inner_n_block(precision)); } diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp index 32d2264822ad57..d0360e45a62e18 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp @@ -42,18 +42,6 @@ size_t compute_vnni_factor(const ov::element::Type& precision); size_t get_elems_in_vec(const ov::element::Type& precision); namespace repacking { -/** - * @brief Computes buffer size that OneDNN impl needs for repacked tensor - * @param copy_b_expr Repacking expression whose information (tensor precision, layout, subtensors) is used for - * buffer size computations - */ -size_t get_repacking_buffer_size(const ov::snippets::lowered::ExpressionPtr& copy_b_expr); -/** - * @brief Computes buffer size that OneDNN impl needs for compensations - * @param copy_b_expr Repacking expression whose information (tensor precision, subtensors) is used for - * buffer size computations - */ -size_t get_compensations_buffer_size(const ov::snippets::lowered::ExpressionPtr& copy_b_expr); /** * @brief Computes leading dimension (LDB) which must be used in brgemm and brgemm_copy_b emitters * @param n_block N block size shared between BrgemmCPU and BrgemmCopyB node diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.cpp index 3aff94fb7f20f9..6dda47e47326aa 100644 --- 
a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.cpp @@ -88,7 +88,7 @@ pass::BrgemmToBrgemmCPU::BrgemmToBrgemmCPU() { set_full_port_desc(output); if (with_amx(brgemm_type)) { - const auto scratch = std::make_shared(ov::Shape{BrgemmCPU::SCRATCH_BYTE_SIZE}); + const auto scratch = std::make_shared(ov::Shape{BrgemmCPU::SCRATCH_BYTE_SIZE}); brgemm_cpu = std::make_shared(brgemm->input_value(0), brgemm_repacking->output(0), scratch, brgemm_type, offset_a, offset_b, 0, offset_c, layout_a, std::vector{}, layout_c); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.cpp index a5382f5afed53f..73da2a786fbee8 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.cpp @@ -34,8 +34,8 @@ std::shared_ptr BrgemmCPUBlocking::DummyPass: LinearIR::constExprIt BrgemmCPUBlocking::move_new_memory_buffer(LinearIR& linear_ir, const LinearIR::constExprIt& brgemm_it) { const auto& brgemm_expr = brgemm_it->get(); const auto wsp_expr = brgemm_expr->get_input_port_connector(2)->get_source().get_expr(); - const auto wsp_buffer = ov::as_type_ptr(wsp_expr->get_node()); - OPENVINO_ASSERT(wsp_buffer, "Incorrect Scratchpad buffer for Brgemm AMX"); + const auto wsp_buffer = ov::as_type_ptr(wsp_expr); + OPENVINO_ASSERT(wsp_buffer && wsp_buffer->get_input_count() == 0, "Incorrect Scratchpad buffer for Brgemm AMX"); // If scratchpad with temp memory is not explicitly before Brgemm, need to move to there. 
if (wsp_expr != *std::prev(brgemm_it)) { const auto wsp_it = linear_ir.find(wsp_expr); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.cpp new file mode 100644 index 00000000000000..bb209d6c282918 --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.cpp @@ -0,0 +1,140 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "insert_brgemm_copy_b_buffers.hpp" + +#include "snippets/lowered/loop_manager.hpp" +#include "snippets/snippets_isa.hpp" +#include "snippets/utils/utils.hpp" +#include "snippets/itt.hpp" + +#include "transformations/snippets/x64/op/brgemm_copy_b.hpp" +#include "utils/general_utils.h" + + +using namespace ov::intel_cpu::brgemm_utils::repacking; +using namespace ov::snippets::lowered; + +namespace ov { +namespace intel_cpu { +namespace pass { + +bool InsertBrgemmCopyBBuffers::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) { + OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::InsertBrgemmCopyBBuffers") + + const auto& factory = linear_ir.get_expr_factory(); + + auto insert_buffer = [&](const ExpressionPtr& copy_b_expr, size_t out_port, LinearIR::constExprIt insertion_pos) { + const auto& copy_b = ov::as_type_ptr(copy_b_expr->get_node()); + const auto& copy_b_out = copy_b_expr->get_output_port_connector(out_port); + const auto copy_b_consumers = copy_b_out->get_consumers(); + OPENVINO_ASSERT(copy_b_consumers.size() == 1, "BufferCopyB must have only one consumer on each out port - Brgemm"); + const auto& buffer_op = std::make_shared(copy_b->output(out_port)); + BufferExpressionPtr buffer_expr = nullptr; + if (out_port == 0) { + buffer_expr = factory->build(buffer_op, {copy_b_out}); + } else if (out_port == 1 && 
with_compensations(copy_b->get_type())) { + buffer_expr = factory->build(buffer_op, {copy_b_out}); + } else { + OPENVINO_THROW("BrgemmCopyB has incorrect output ports"); + } + return linear_ir.insert_expr(buffer_expr, LoopManager::get_common_outer_loops(copy_b_expr, copy_b_consumers.begin()->get_expr()), + true, insertion_pos, {copy_b_consumers}); + }; + + bool modified = false; + for (auto expr_it = begin; expr_it != end; ++expr_it) { + const auto expr = *expr_it; + if (auto copy_b = ov::as_type_ptr(expr->get_node())) { + for (size_t i = 0; i < expr->get_output_count(); ++i) { + expr_it = insert_buffer(expr, i, std::next(expr_it)); + } + modified = true; + } + } + return modified; +} + +InsertBrgemmCopyBBuffers::RepackedWeightsBufferExpression::RepackedWeightsBufferExpression(const RepackedWeightsBufferExpression& other) + : BufferExpression(other) {} + +InsertBrgemmCopyBBuffers::RepackedWeightsBufferExpression::RepackedWeightsBufferExpression(const std::shared_ptr& n, + const std::shared_ptr& factory) : BufferExpression(n, factory) {} + +snippets::lowered::ExpressionPtr InsertBrgemmCopyBBuffers::RepackedWeightsBufferExpression::clone() const { + return std::shared_ptr(new RepackedWeightsBufferExpression(*this)); +} + +void InsertBrgemmCopyBBuffers::RepackedWeightsBufferExpression::validate() const { + BufferExpression::validate(); + OPENVINO_ASSERT(get_input_count() == 1, "RepackedWeightsBufferExpression must have only one input"); + const auto& parent_out = get_input_port_connector(0)->get_source(); + OPENVINO_ASSERT(ov::is_type(parent_out.get_expr()->get_node()) && parent_out.get_index() == 0, + "RepackedWeightsBufferExpression expects BrgemmCopyB as parent expression"); +} + +void InsertBrgemmCopyBBuffers::RepackedWeightsBufferExpression::init_allocation_size(const std::shared_ptr& loop_manager, + size_t allocation_rank) { + const auto& parent_expr = get_input_port_connector(0)->get_source().get_expr(); + const auto& in_layout = 
parent_expr->get_input_port_descriptor(0)->get_layout(); + const auto& in_subtensor = ov::snippets::utils::get_projected_subtensor(parent_expr->get_input_port(0)); + + const size_t n_blk = *in_subtensor.rbegin(); + const size_t k_blk = *++in_subtensor.rbegin(); + OPENVINO_ASSERT(!ov::snippets::utils::is_dynamic_value(n_blk) && !ov::snippets::utils::is_dynamic_value(k_blk), + "RepackedWeightsBufferExpression supports only static subtensor values"); + + const auto& precision = get_node()->get_input_element_type(0); + // Repacking buffer shape is set in accordance to OneDNN requirements + const size_t N_dim = std::max(n_blk, compute_inner_n_block(precision)); + if (!in_layout.empty() && in_layout.back() != in_layout.size() - 1) { + // In case of transpose, K dimension must be rounded-up to number of elems in vector register + // For the details, please see 'transpose16x8' and 'fixup16x16' implementations and usage in onednn/src/cpu/x64/matmul/brgemm_matmul_copy_utils.cpp + const auto elems_in_vec = brgemm_utils::get_elems_in_vec(precision); + m_allocation_size = N_dim * rnd_up(k_blk, elems_in_vec); + } else { + // Low precision repacking writes the result by m_brgemmVNNIFactor * m_inner_n_block blocks + // despite the actual size of the input data. Because of that we have to round-up the allocation shape to always have enough memory allocated. 
+ // For the details, please see 'copy_4x64' and 'copy_2x32' implementations and usage in onednn/src/cpu/x64/matmul/brgemm_matmul_copy_utils.cpp + const auto brgemmVNNIFactor = brgemm_utils::compute_vnni_factor(precision); + OPENVINO_ASSERT(brgemmVNNIFactor > 0, "brgemmVNNIFactor value must be positive."); + m_allocation_size = N_dim * rnd_up(k_blk, brgemmVNNIFactor); + } +} + +InsertBrgemmCopyBBuffers::CompensationsBufferExpression::CompensationsBufferExpression(const CompensationsBufferExpression& other) + : BufferExpression(other) {} + +InsertBrgemmCopyBBuffers::CompensationsBufferExpression::CompensationsBufferExpression(const std::shared_ptr& n, + const std::shared_ptr& factory) : BufferExpression(n, factory) {} + +snippets::lowered::ExpressionPtr InsertBrgemmCopyBBuffers::CompensationsBufferExpression::clone() const { + return std::shared_ptr(new CompensationsBufferExpression(*this)); +} + +void InsertBrgemmCopyBBuffers::CompensationsBufferExpression::validate() const { + BufferExpression::validate(); + OPENVINO_ASSERT(get_input_count() == 1, "CompensationsBufferExpression must have only one input"); + const auto& parent_out = get_input_port_connector(0)->get_source(); + OPENVINO_ASSERT(ov::is_type(parent_out.get_expr()->get_node()) && parent_out.get_index() == 1, + "CompensationsBufferExpression expects BrgemmCopyB as parent expression"); +} + +void InsertBrgemmCopyBBuffers::CompensationsBufferExpression::init_allocation_size(const std::shared_ptr& loop_manager, + size_t allocation_rank) { + const auto& parent_expr = get_input_port_connector(0)->get_source().get_expr(); + const auto& in_subtensor = ov::snippets::utils::get_projected_subtensor(parent_expr->get_input_port(0)); + const size_t n_blk = *in_subtensor.rbegin(); + OPENVINO_ASSERT(!ov::snippets::utils::is_dynamic_value(n_blk), "CompensationsBufferExpression supports only static subtensor values"); + const auto& precision = parent_expr->get_node()->get_input_element_type(0); + // Compensations are 
computed during repacking, so we need to round-up allocation shape according to m_inner_n_block + // because of OneDNN implementation nuances (as in get_repacking_buffer_size). + // However, the compensations are computed by N dimension, so K dimension doesn't affect the compensations buffer + m_allocation_size = std::max(n_blk, compute_inner_n_block(precision)); +} + +} // namespace pass +} // namespace intel_cpu +} // namespace ov + diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.hpp new file mode 100644 index 00000000000000..c13a239a39851d --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.hpp @@ -0,0 +1,65 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "snippets/lowered/pass/pass.hpp" + +#include "snippets/lowered/expressions/buffer_expression.hpp" + +namespace ov { +namespace intel_cpu { +namespace pass { + +/** + * @interface InsertBrgemmCopyBBuffers + * @brief Insert Buffers after BrgemmCopyB with algorithm of allocation size calculation which + * distinguishes with common algorithm + * @ingroup snippets + */ +class InsertBrgemmCopyBBuffers: public snippets::lowered::pass::RangedPass { +public: + InsertBrgemmCopyBBuffers() = default; + OPENVINO_RTTI("InsertBrgemmCopyBBuffers", "Pass"); + bool run(snippets::lowered::LinearIR& linear_ir, snippets::lowered::LinearIR::constExprIt begin, snippets::lowered::LinearIR::constExprIt end) override; + +private: + class RepackedWeightsBufferExpression : public snippets::lowered::BufferExpression { + friend class snippets::lowered::ExpressionFactory; + public: + OPENVINO_RTTI("RepackedWeightsBufferExpression", "0", BufferExpression) + RepackedWeightsBufferExpression() = default; + + void validate() const override; + + void 
init_allocation_size(const std::shared_ptr& loop_manager, size_t allocation_rank) override; + + private: + RepackedWeightsBufferExpression(const RepackedWeightsBufferExpression& other); + RepackedWeightsBufferExpression(const std::shared_ptr& n, const std::shared_ptr& factory); + + snippets::lowered::ExpressionPtr clone() const override; + }; + + class CompensationsBufferExpression : public snippets::lowered::BufferExpression { + friend class snippets::lowered::ExpressionFactory; + public: + OPENVINO_RTTI("CompensationsBufferExpression", "0", BufferExpression) + CompensationsBufferExpression() = default; + + void validate() const override; + + void init_allocation_size(const std::shared_ptr& loop_manager, size_t allocation_rank) override; + + private: + CompensationsBufferExpression(const CompensationsBufferExpression& other); + CompensationsBufferExpression(const std::shared_ptr& n, const std::shared_ptr& factory); + + snippets::lowered::ExpressionPtr clone() const override; + }; +}; + +} // namespace pass +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.cpp deleted file mode 100644 index 332c0cccaf4acc..00000000000000 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.cpp +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "snippets/itt.hpp" - -#include "set_brgemm_copy_b_buffers_shape.hpp" -#include "snippets/snippets_isa.hpp" -#include "snippets/utils/utils.hpp" - -#include "transformations/snippets/x64/op/brgemm_copy_b.hpp" -#include "transformations/snippets/x64/op/brgemm_utils.hpp" - -using namespace ov::intel_cpu::brgemm_utils::repacking; - -bool 
ov::intel_cpu::pass::SetBrgemmCopyBBuffersShape::run(snippets::lowered::LinearIR& linear_ir, - snippets::lowered::LinearIR::constExprIt begin, - snippets::lowered::LinearIR::constExprIt end) { - OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::SetBrgemmCopyBBuffersShape") - - auto get_buffer_from_output = [](const snippets::lowered::ExpressionPtr& expr, const size_t out_idx) { - const auto& consumers = expr->get_output_port_connector(out_idx)->get_consumers(); - OPENVINO_ASSERT(consumers.size() == 1, "BrgemmCopyB must have only 1 consumer"); - const auto buffer = ov::as_type_ptr(consumers.begin()->get_expr()->get_node()); - OPENVINO_ASSERT(buffer, "BrgemmCopyB consumer must be Buffer"); - return buffer; - }; - - bool modified = false; - for (auto expr_it = begin; expr_it != end; ++expr_it) { - const auto& expr = *expr_it; - if (auto copy_b = ov::as_type_ptr(expr->get_node())) { - const auto buffer = get_buffer_from_output(expr, 0); - buffer->set_allocation_size(get_repacking_buffer_size(expr)); - if (with_compensations(copy_b->get_type())) { - const auto compensations_buffer = get_buffer_from_output(expr, 1); - compensations_buffer->set_allocation_size(get_compensations_buffer_size(expr)); - } - modified = true; - } - } - return modified; -} diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.hpp deleted file mode 100644 index 1b348ecbf2740c..00000000000000 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.hpp +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include "snippets/lowered/pass/pass.hpp" - -namespace ov { -namespace intel_cpu { -namespace pass { - -/** - * @interface SetBrgemmCopyBBuffersShape - * @brief Sets the 
allocation shape for the Buffers after BrgemmCopyB node using BrgemmCopyB parameters - * This pass may be deprecated when a more generic memory management approach is introduced. - * Ticket: 113744 - * @ingroup snippets - */ -class SetBrgemmCopyBBuffersShape: public snippets::lowered::pass::RangedPass { -public: - SetBrgemmCopyBBuffersShape() = default; - OPENVINO_RTTI("SetBrgemmCopyBBuffersShape", "Pass"); - bool run(snippets::lowered::LinearIR& linear_ir, - snippets::lowered::LinearIR::constExprIt begin, - snippets::lowered::LinearIR::constExprIt end) override; -}; - -} // namespace pass -} // namespace intel_cpu -} // namespace ov diff --git a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/brgemm_blocking.cpp b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/brgemm_blocking.cpp index 4be2638e28b893..89f2e06c14a9fa 100644 --- a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/brgemm_blocking.cpp +++ b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/brgemm_blocking.cpp @@ -310,7 +310,7 @@ TEST_F(BrgemmCPUBlockingTest, AMX) { { auto data_a = linear_ir->push_node(precision, input_shape_a); auto data_b = linear_ir->push_node(precision, input_shape_b); - auto scratch = linear_ir->push_node(ov::Shape{BrgemmCPU::SCRATCH_BYTE_SIZE}); + auto scratch = linear_ir->push_node(ov::Shape{BrgemmCPU::SCRATCH_BYTE_SIZE}); auto copy_b = linear_ir->push_node(data_b.second, precision, BRGEMM_TYPE::REPACKING_ONLY); init_expr_descriptors(*copy_b.first); auto brgemm = linear_ir->push_node(data_a.second, copy_b.second, scratch.second, BRGEMM_TYPE::WITH_AMX); @@ -324,7 +324,7 @@ TEST_F(BrgemmCPUBlockingTest, AMX) { const auto copy_b_expr = *copy_b.first; init_expr_descriptors(copy_b_expr, {{full_dim, full_dim}, {full_dim, full_dim}}); - auto scratch = linear_ir_ref->push_node(ov::Shape{BrgemmCPU::SCRATCH_BYTE_SIZE}); + auto scratch = linear_ir_ref->push_node(ov::Shape{BrgemmCPU::SCRATCH_BYTE_SIZE}); 
scratch.first->get()->set_loop_ids({0}); auto brgemm = linear_ir_ref->push_node(data_a.second, copy_b.second, scratch.second, BRGEMM_TYPE::WITH_AMX); diff --git a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp index 5434ff228aa833..6dad1d4772f531 100644 --- a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp +++ b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp @@ -17,7 +17,7 @@ #include "transformations/snippets/x64/shape_inference.hpp" #include "transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.hpp" -#include "transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.hpp" +#include "transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.hpp" #include "transformations/snippets/x64/op/brgemm_cpu.hpp" #include "transformations/snippets/x64/op/brgemm_copy_b.hpp" @@ -85,22 +85,20 @@ class BufferAllocationCPUTest : public testing::TestWithParam(m_vector_size); pipeline.register_pass(); pipeline.register_pass(); + pipeline.register_pass(); pipeline.register_pass(); pipeline.register_pass(m_vector_size); pipeline.register_pass(); pipeline.register_pass(); - pipeline.register_pass(); pipeline.register_pass(m_is_buffer_optimized); pipeline.run(m_linear_ir); } void Validate() { std::set reg_groups, clusters; - for (const auto& expr : m_linear_ir) { - if (const auto buffer = ov::as_type_ptr(expr->get_node())) { - reg_groups.insert(buffer->get_reg_group()); - clusters.insert(buffer->get_cluster_id()); - } + for (const auto& buffer : m_linear_ir.get_buffers()) { + reg_groups.insert(buffer->get_reg_group()); + clusters.insert(buffer->get_cluster_id()); } EXPECT_EQ(reg_groups.size(), m_expected_reg_group_count); EXPECT_EQ(clusters.size(), m_expected_cluster_count); @@ -211,7 +209,7 @@ class 
MHABF16AMXBufferAllocationTest : public BufferAllocationCPUTest { const auto convert1 = std::make_shared(relu0, ov::element::bf16); const auto brgemm_copyb0 = std::make_shared(convert1, ov::element::bf16); - const auto scratch0 = std::make_shared(ov::Shape{ov::intel_cpu::BrgemmCPU::SCRATCH_BYTE_SIZE}); + const auto scratch0 = std::make_shared(ov::Shape{ov::intel_cpu::BrgemmCPU::SCRATCH_BYTE_SIZE}); const auto brgemm_cpu0 = std::make_shared( parameter0, brgemm_copyb0->output(0), scratch0, BRGEMM_TYPE::WITH_AMX); @@ -231,7 +229,7 @@ class MHABF16AMXBufferAllocationTest : public BufferAllocationCPUTest { const auto convert2 = std::make_shared(multiply, ov::element::bf16); const auto brgemm_copyb1 = std::make_shared(parameter2, ov::element::bf16); - const auto scratch1 = std::make_shared(ov::Shape{ov::intel_cpu::BrgemmCPU::SCRATCH_BYTE_SIZE}); + const auto scratch1 = std::make_shared(ov::Shape{ov::intel_cpu::BrgemmCPU::SCRATCH_BYTE_SIZE}); const auto brgemm_cpu1 = std::make_shared( convert2, brgemm_copyb1->output(0), scratch1, BRGEMM_TYPE::WITH_AMX);