diff --git a/src/common/snippets/include/snippets/lowered/expression.hpp b/src/common/snippets/include/snippets/lowered/expression.hpp index a04368e5605435..286f561b5bcb03 100644 --- a/src/common/snippets/include/snippets/lowered/expression.hpp +++ b/src/common/snippets/include/snippets/lowered/expression.hpp @@ -17,15 +17,18 @@ namespace ov { namespace snippets { namespace lowered { +class ExpressionFactory; class LinearIR; using ExpressionPtr = std::shared_ptr; using ExpressionMap = std::unordered_map; class Expression : public std::enable_shared_from_this { friend class LinearIR; + friend class ExpressionFactory; friend class ExpressionPort; public: Expression() = default; + virtual ~Expression() = default; std::shared_ptr get_node() const; std::shared_ptr get_emitter() const; @@ -50,7 +53,8 @@ class Expression : public std::enable_shared_from_this { void set_input_port_connector(size_t port, PortConnectorPtr to); - void validate() const; + // Attention! Cannot be called in ctor because this method validats port attributes (descs, connectors) + virtual void validate() const; ExpressionPort get_input_port(size_t i); ExpressionPort get_output_port(size_t i); @@ -61,16 +65,52 @@ class Expression : public std::enable_shared_from_this { bool needShapeInfer() const { return m_need_shape_infer; } const std::vector& get_loop_ids() const; void set_loop_ids(const std::vector& loops); - ExpressionPtr clone_with_new_inputs(const std::vector& new_inputs, - const std::shared_ptr& new_node) const; + + /** + * @brief Clone Expression with new node and input port attributes/ + * Output port descriptors will be cloned from the current expression. + * Output port connecters will be created. + * @param new_node new node + * @param new_inputs new input port connectors + * @param new_in_descs new input port descriptors. 
If this collection is empty, + * descriptors will be copied from the current expression + * @return the copy + */ + ExpressionPtr clone_with_new_inputs(const std::shared_ptr& new_node, const std::vector& new_inputs, + const std::vector& new_in_descs = {}) const; + /** + * @brief Clone Expression with new node using `expr_map` to connect to new parent expressions. + * @param expr_map the map with the original and cloned expressions + * @param new_node new node + * @return the copy + */ ExpressionPtr clone_with_new_inputs(const ExpressionMap& expr_map, const std::shared_ptr& new_node) const; + virtual bool visit_attributes(AttributeVisitor &visitor); + + // Note that get_type_info_static and get_type_info are needed to mimic OPENVINO_RTTI interface, + // so the standard OPENVINO_RTTI(...) macros could be used in derived classes. + _OPENVINO_HIDDEN_METHOD static const ::ov::DiscreteTypeInfo& get_type_info_static() { + static ::ov::DiscreteTypeInfo type_info_static {"Expression"}; + type_info_static.hash(); + return type_info_static; + } + + virtual const DiscreteTypeInfo& get_type_info() const { + return get_type_info_static(); + } + + const char* get_type_name() const { + return get_type_info().name; + } + protected: - Expression(const Expression& other); // Note: The constructor initialization is private since an expression can be created only by Linear IR. // The method must be used only by Linear IR builder of expressions! 
Expression(const std::shared_ptr& n, const std::shared_ptr& factory, bool need_shape_infer = true); - void update_node_and_connectors(const std::vector& new_inputs, const std::shared_ptr& new_node); + + // Virtual clone method which is called in clone_with_new_inputs with common logic + virtual ExpressionPtr clone() const; std::shared_ptr m_source_node{nullptr}; std::shared_ptr m_emitter{nullptr}; diff --git a/src/common/snippets/include/snippets/lowered/expression_factory.hpp b/src/common/snippets/include/snippets/lowered/expression_factory.hpp index ca45fe936e0500..d617eb3d03b410 100644 --- a/src/common/snippets/include/snippets/lowered/expression_factory.hpp +++ b/src/common/snippets/include/snippets/lowered/expression_factory.hpp @@ -4,65 +4,72 @@ #pragma once -#include "linear_ir.hpp" +#include "expression.hpp" +#include "expressions/buffer_expression.hpp" -#include "snippets/snippets_isa.hpp" +#include "snippets/op/loop.hpp" +#include "snippets/op/buffer.hpp" +#include "snippets/op/perf_count.hpp" namespace ov { namespace snippets { namespace lowered { -class LinearIR::ExpressionFactory { +class ExpressionFactory { public: - template - static ExpressionPtr build(const std::shared_ptr& n, Args&&... 
params) { - if (const auto par = ov::as_type_ptr(n)) { - return create(par, params...); - } else if (const auto res = ov::as_type_ptr(n)) { - return create(res, params...); - } else if (const auto loop_begin = ov::as_type_ptr(n)) { - return create(loop_begin, params...); - } else if (const auto loop_end = ov::as_type_ptr(n)) { - return create(loop_end, params...); -#ifdef SNIPPETS_DEBUG_CAPS - } else if (const auto perf_counter = ov::as_type_ptr(n)) { - return create(perf_counter, params...); - } else if (const auto perf_counter = ov::as_type_ptr(n)) { - return create(perf_counter, params...); -#endif - } - return create(n, params...); + ExpressionFactory(std::shared_ptr shape_infer_factory) + : m_shape_infer_factory(std::move(shape_infer_factory)) {} + + template ::value, bool>::type = true> + std::shared_ptr build(const std::shared_ptr& n, const std::vector& inputs, Args... args) { + return create(n, inputs, m_shape_infer_factory, args...); } private: - /* -- Default Builders - initialize input port connectors from parents and create new output port connectors themselves */ - static ExpressionPtr create(const std::shared_ptr& par, const LinearIR& linear_ir); - static ExpressionPtr create(const std::shared_ptr& res, const LinearIR& linear_ir); - static ExpressionPtr create(const std::shared_ptr& n, const LinearIR& linear_ir); - - /* -- Input Builders - get input port connectors from method parameters and create new output port connectors themselves */ - static ExpressionPtr create(const std::shared_ptr& n, const std::vector& inputs, const LinearIR& linear_ir); - static ExpressionPtr create(const std::shared_ptr& n, const std::vector& inputs, const LinearIR& linear_ir); - static ExpressionPtr create(const std::shared_ptr& n, const std::vector& inputs, const LinearIR& linear_ir); + static ExpressionPtr create(const std::shared_ptr& par, const std::vector& inputs, + const std::shared_ptr& shape_infer_factory); + static ExpressionPtr create(const std::shared_ptr& res, 
const std::vector& inputs, + const std::shared_ptr& shape_infer_factory); + static ExpressionPtr create(const std::shared_ptr& n, const std::vector& inputs, + const std::shared_ptr& shape_infer_factory); + static ExpressionPtr create(const std::shared_ptr& n, const std::vector& inputs, + const std::shared_ptr& shape_infer_factory); // Note: PerfCountBegin nodes have a PerfCountEnd ov::Output, but corresponding expression should not have any outputs to avoid register allocation #ifdef SNIPPETS_DEBUG_CAPS - static ExpressionPtr create(const std::shared_ptr& n, - const std::vector& inputs, - const LinearIR& linear_ir); - static ExpressionPtr create(const std::shared_ptr& n, - const std::vector& inputs, - const LinearIR& linear_ir); - static ExpressionPtr create_without_connections(const std::shared_ptr& n, const LinearIR& linear_ir); + static ExpressionPtr create(const std::shared_ptr& n, const std::vector& inputs, + const std::shared_ptr& shape_infer_factory); + static ExpressionPtr create(const std::shared_ptr& n, const std::vector& inputs, + const std::shared_ptr& shape_infer_factory); + static ExpressionPtr create_without_connections(const std::shared_ptr& n, const std::shared_ptr& shape_infer_factory); #endif - // Creates inputs for expression using parent output port connectors - static void create_expression_inputs(const LinearIR& linear_ir, const ExpressionPtr& expr); + template ::value, bool>::type = true> + static std::shared_ptr create(const std::shared_ptr& n, const std::vector& inputs, + const std::shared_ptr& shape_infer_factory, Args... args) { + auto expr = std::shared_ptr(new T(n, shape_infer_factory, args...)); + init_expression_inputs(expr, inputs); + create_expression_outputs(expr); + expr->validate(); + // todo: here we blindly synchronize input shapes from parent and child. 
Remove this when shapes will be stored in port connector itself + if (shape_infer_factory) + expr->updateShapes(); + return expr; + } + // Creates new output port connectors static void create_expression_outputs(const ExpressionPtr& expr); // The method verifies of input port connectors to availability of the expression as consumer and add it if missed static void init_expression_inputs(const ExpressionPtr& expr, const std::vector& inputs); + + const std::shared_ptr m_shape_infer_factory = nullptr; }; +using ExpressionFactoryPtr = std::shared_ptr; + +template<> +std::shared_ptr ExpressionFactory::build(const std::shared_ptr& n, const std::vector& inputs); } // namespace lowered } // namespace snippets diff --git a/src/common/snippets/include/snippets/lowered/expressions/buffer_expression.hpp b/src/common/snippets/include/snippets/lowered/expressions/buffer_expression.hpp new file mode 100644 index 00000000000000..3dcd98ef0a95fd --- /dev/null +++ b/src/common/snippets/include/snippets/lowered/expressions/buffer_expression.hpp @@ -0,0 +1,69 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "snippets/lowered/expression.hpp" + +#include "snippets/utils/utils.hpp" + + +namespace ov { +namespace snippets { +namespace lowered { + +// To avoid cycle-dependancy of includes, we forward-declare LoopManager +class LoopManager; +/** + * @interface BufferExpression + * @brief This is a base class for memory storage. + * Note that Buffer should be a single consumer for operation output port + * @param m_allocation_size - memory size for allocation in bytes. Dynamic value means undefined size. + * @param m_offset - offset in common Buffer scratchpad + * @param m_reg_group - number of register group. The Buffers from the same register group will have the same GPR + * @param m_cluster_id - number of cluster. The Buffers from the same cluster shares memory between them and will have the same offset. 
+ * @ingroup snippets + */ +class BufferExpression : public Expression { + friend class ExpressionFactory; +public: + OPENVINO_RTTI("BufferExpression", "0", Expression) + BufferExpression() = default; + + bool visit_attributes(AttributeVisitor &visitor) override; + + size_t get_reg_group() const { return m_reg_group; } + size_t get_cluster_id() const { return m_cluster_id; } + size_t get_offset() const { return m_offset; } + size_t get_allocation_size() const { return m_allocation_size; } + size_t get_byte_size() const; + + void set_reg_group(size_t reg_group) { m_reg_group = reg_group; } + void set_cluster_id(size_t cluster) { m_cluster_id = cluster; } + void set_allocation_size(size_t size) { m_allocation_size = size; } + void set_offset(size_t offset) { m_offset = offset; } + + virtual void init_allocation_size(const std::shared_ptr& loop_manager, size_t allocation_rank); + + // Returns True, if allocation size is known. Otherwise returns False - allocation size is undefined + bool is_defined() const; + + // Returns True, if the memory is independent - expression doesn't have parents (source) + bool is_independent_memory() const { return get_input_count() == 0; } + +protected: + BufferExpression(const std::shared_ptr& n, const std::shared_ptr& factory); + + ExpressionPtr clone() const override; + + size_t m_allocation_size = utils::get_dynamic_value(); + size_t m_reg_group = 0; + size_t m_cluster_id = 0; + size_t m_offset = utils::get_dynamic_value(); +}; +using BufferExpressionPtr = std::shared_ptr; + +} // namespace lowered +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/include/snippets/lowered/linear_ir.hpp b/src/common/snippets/include/snippets/lowered/linear_ir.hpp index 55afd2c9ccd7ab..7f9436bb6ac43b 100644 --- a/src/common/snippets/include/snippets/lowered/linear_ir.hpp +++ b/src/common/snippets/include/snippets/lowered/linear_ir.hpp @@ -7,6 +7,8 @@ #include #include "snippets/lowered/expression.hpp" +#include 
"snippets/lowered/expression_factory.hpp" +#include "snippets/lowered/expressions/buffer_expression.hpp" #include "snippets/target_machine.hpp" #include "snippets/shape_inference/shape_inference.hpp" #ifdef SNIPPETS_DEBUG_CAPS @@ -51,7 +53,6 @@ using LoopManagerPtr = std::shared_ptr; */ class LinearIR { friend class LinearIRBuilder; - class ExpressionFactory; public: using container = std::list; using exprIt = container::iterator; @@ -62,12 +63,12 @@ class LinearIR { LinearIR(Config config = {}, const std::shared_ptr& factory = {}); LinearIR(const std::shared_ptr& m, const std::shared_ptr& factory, Config config = {}); - ExpressionPtr create_expression(const std::shared_ptr& n, const std::vector& inputs) const; + const ExpressionFactoryPtr& get_expr_factory() const; const container& get_ops() const { return m_expressions; } - const container& get_buffers() const { return m_buffer_expressions; } - const container& get_parameters() const { return m_parameter_expressions; } - const container& get_results() const { return m_result_expressions; } + const std::vector& get_parameters() const { return m_parameter_expressions; } + const std::vector& get_results() const { return m_result_expressions; } + const std::vector& get_buffers() const { return m_buffer_expressions; } const Config& get_config() const { return m_config; } size_t get_static_buffer_scratchpad_size() const { return m_static_buffer_scratchpad_size; } @@ -186,6 +187,20 @@ class LinearIR { return std::make_pair(expr_it, node); } + /** + * @brief Insert new Expression to LinearIR, sets `loops_ids` as loop identifiers and inserts the expression on the `place` in LinearIR. 
+ * Also connects output ports to `consumers` + * @param new_expr the target expr which were created by ExpressionFactory + * @param loop_ids vector of loops ids that will be set for the expression + * @param update_loop_ports true - the helpers updates the corresponding loop ports after insertion otherwise - skip + * @param place before this place expression will be inserted + * @param consumers vector of expression port sets. These expression ports will be consumers of the expression. + * The vector may be empty or size of vector must be equal to output port count + * @return new expression iterator in LinearIR + */ + exprIt insert_expr(const ExpressionPtr& new_expr, const std::vector& loop_ids, + bool update_loop_ports, const constExprIt& place, const std::vector>& consumers); + /** * @brief Replace the several existing expressions with the one new expression that contains `new_node`. * Calls the helper `insert_node` and performs substitution: removes `old_exprs`. @@ -248,21 +263,22 @@ class LinearIR { private: class LIRShapeInfer : public ShapeInferSnippetsNode { public: - explicit LIRShapeInfer(const container& body_exprs, const container& param_exprs, const container& result_exprs); + explicit LIRShapeInfer(const container& body_exprs, const std::vector& param_exprs, const std::vector& result_exprs); Result infer(const std::vector& input_shapes) override; private: const container& m_exprs; - const container& m_input_exprs; - const container& m_output_exprs; + const std::vector& m_input_exprs; + const std::vector& m_output_exprs; }; static ov::NodeVector get_ordered_ops(const std::shared_ptr& model); - // Default way: expr port connectors are constructed basing on ov::Node connection - ExpressionPtr create_expression(const std::shared_ptr& n); ExpressionPtr create_expression(const std::shared_ptr& n, const std::vector& new_inputs, const std::vector& loop_ids, bool update_loop_ports, const std::vector>& consumers = {}); + // Creates inputs for expression using 
parent output port connectors + std::vector get_expression_inputs_by_node(const std::shared_ptr& n) const; + void register_expression(const ExpressionPtr& expr, bool io_allowed, double exec_num); void unregister_expression(const ExpressionPtr& expr); @@ -271,13 +287,16 @@ class LinearIR { container m_expressions{}; std::unordered_map, std::shared_ptr> m_node2expression_map; - container m_parameter_expressions{}; - container m_result_expressions{}; - container m_buffer_expressions{}; + // Note: Parameters and Results are stored in the order of Subgraph inputs/outputs + std::vector m_parameter_expressions{}; + std::vector m_result_expressions{}; + // Note: BufferExpressions are not stored in the order of execution numbers + std::vector m_buffer_expressions{}; Config m_config{}; LoopManagerPtr m_loop_manager; - std::shared_ptr m_shape_infer_factory; + std::shared_ptr m_shape_infer_factory = nullptr; std::shared_ptr m_shape_infer = nullptr; + std::shared_ptr m_expression_factory = nullptr; bool m_is_dynamic = false; // Size of static Buffer Scratchpad (Buffers with defined allocation size) diff --git a/src/common/snippets/include/snippets/lowered/pass/compute_buffer_allocation_size.hpp b/src/common/snippets/include/snippets/lowered/pass/compute_buffer_allocation_size.hpp index 830956338ef4a1..01d8b3ee85261e 100644 --- a/src/common/snippets/include/snippets/lowered/pass/compute_buffer_allocation_size.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/compute_buffer_allocation_size.hpp @@ -22,14 +22,9 @@ namespace pass { class ComputeBufferAllocationSize : public RangedPass { public: OPENVINO_RTTI("ComputeBufferAllocationSize", "RangedPass") - ComputeBufferAllocationSize(size_t buffer_allocation_rank) : m_buffer_allocation_rank(buffer_allocation_rank) {} + ComputeBufferAllocationSize() = default; bool run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override; - - static size_t get_allocation_size(const 
LoopManagerPtr& loop_manager, const ExpressionPtr& buffer_expr, size_t allocation_rank); - -private: - const size_t m_buffer_allocation_rank = 0; }; } // namespace pass diff --git a/src/common/snippets/include/snippets/lowered/pass/define_buffer_clusters.hpp b/src/common/snippets/include/snippets/lowered/pass/define_buffer_clusters.hpp index 824b0d4daea75d..1597eaa2377a50 100644 --- a/src/common/snippets/include/snippets/lowered/pass/define_buffer_clusters.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/define_buffer_clusters.hpp @@ -43,27 +43,27 @@ class DefineBufferClusters : public RangedPass { bool run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override; private: - using BufferCluster = std::set; + using BufferCluster = std::set; using BufferClusters = std::vector; - using BufferPorts = std::unordered_map>; + using BufferPorts = std::unordered_map>; /** * @brief Finds Buffer cluster in set of clusters which contains the target expression with Buffer * @param target target expression with Buffer op * @return vector iterator which refers to the found cluster */ - BufferClusters::iterator find_cluster_by_expr(const ExpressionPtr& target); + BufferClusters::iterator find_cluster_by_expr(const BufferExpressionPtr& target); /** * @brief Returns True if Buffer is direct source for the target expr (there aren't other loop between the Buffer and target expr) * @param buffer_expr expression with assumed Buffer op * @param target_expr expression with target op - LoopEnd or MemoryAccess op * @return boolean value */ - bool is_direct_buffer(const ExpressionPtr& buffer_expr, const ExpressionPtr& target_expr) const; + bool is_direct_buffer(const BufferExpressionPtr& buffer_expr, const ExpressionPtr& target_expr) const; /** * @brief Creates new buffer cluster if buffer_exprs is missed in clusters. 
If buffer_exprs is already in clusters, do nothing * @param buffer_expr expression with Buffer op */ - void create_new_cluster(const ExpressionPtr& buffer_expr); + void create_new_cluster(const BufferExpressionPtr& buffer_expr); /** * @brief Returns common ID of cluster if all buffer inside have the same Buffer ID. Otherwise returns the default value SIZE_MAX * that means that Buffers in cluster have different IDs. @@ -106,7 +106,7 @@ class DefineBufferClusters : public RangedPass { * @param buffer_expr expression with Buffer op * @return finalization offset - int64_t value */ - int64_t get_buffer_finalization_offset(const ExpressionPtr& buffer_expr) const; + int64_t get_buffer_finalization_offset(const BufferExpressionPtr& buffer_expr) const; /** * @brief Check if two Buffer expressions are connected to the same Loop. Set common LoopEnd as `loop` parameter and * indexes of Loop ports `up_idx` and `down_idx` if Buffers are really neighbours @@ -117,7 +117,8 @@ class DefineBufferClusters : public RangedPass { * @param down_idx the reference to port index of lower Buffer op to the Loop * @return Return True if the Buffers are connected to the same Loop */ - static bool are_buffer_neighbours(const ExpressionPtr& up, const ExpressionPtr& down, ExpressionPtr& loop, size_t& up_idx, size_t& down_idx); + static bool are_buffer_neighbours(const BufferExpressionPtr& up, const BufferExpressionPtr& down, ExpressionPtr& loop, + size_t& up_idx, size_t& down_idx); /** * @brief Unite clusters * @param inner_cluster_it iterator to inner cluster - buffer cluster is in the loop @@ -127,7 +128,7 @@ class DefineBufferClusters : public RangedPass { * @return Return True if clusters have been united */ bool unite_nested_clusters(const BufferClusters::iterator& inner_cluster_it, BufferCluster& outer_cluster, - const ExpressionPtr& outer_buffer, bool is_outer_up); + const BufferExpressionPtr& outer_buffer, bool is_outer_up); BufferClusters m_clusters; }; diff --git 
a/src/common/snippets/include/snippets/lowered/pass/propagate_buffer_offset.hpp b/src/common/snippets/include/snippets/lowered/pass/propagate_buffer_offset.hpp index a602569d793a55..d895b3a60cd26d 100644 --- a/src/common/snippets/include/snippets/lowered/pass/propagate_buffer_offset.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/propagate_buffer_offset.hpp @@ -34,7 +34,7 @@ class PropagateBufferOffset: public Pass { * @brief Propagates Buffer offset to the connected memory access ops * @param buffer_expr expression with Buffer op with inited offset */ - static void propagate(const ExpressionPtr& buffer_expr); + static void propagate(const BufferExpressionPtr& buffer_expr); }; } // namespace pass diff --git a/src/common/snippets/include/snippets/lowered/pass/set_buffer_reg_group.hpp b/src/common/snippets/include/snippets/lowered/pass/set_buffer_reg_group.hpp index 8faf2419a0a313..cba3f28856be42 100644 --- a/src/common/snippets/include/snippets/lowered/pass/set_buffer_reg_group.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/set_buffer_reg_group.hpp @@ -64,8 +64,8 @@ class SetBufferRegGroup: public RangedPass { static bool can_be_in_one_group(const ShiftPtrParams& lhs, const ShiftPtrParams& rhs); private: - using BufferPool = std::vector; - using BufferMap = std::map; + using BufferPool = std::vector; + using BufferMap = std::map; /** * @brief Get Buffer Index in Buffer set @@ -73,7 +73,7 @@ class SetBufferRegGroup: public RangedPass { * @param pool set of Buffers from the Linear IR * @return index of target Buffer expression in set */ - static size_t get_buffer_idx(const ExpressionPtr& target, const BufferPool& pool); + static size_t get_buffer_idx(const BufferExpressionPtr& target, const BufferPool& pool); /** * @brief Create adjacency matrix for Buffer system. See comment in the method for more details. 
* @param linear_ir the target Linear IR @@ -99,8 +99,8 @@ class SetBufferRegGroup: public RangedPass { * @param buffers set of Buffers from the Linear IR * @param adj Target adjacency matrix */ - static void update_adj_matrix(const std::pair& lhs, - const std::pair& rhs, + static void update_adj_matrix(const std::pair& lhs, + const std::pair& rhs, const BufferPool& buffers, std::vector& adj); /** @@ -109,8 +109,8 @@ class SetBufferRegGroup: public RangedPass { * @param rhs Pair where first value is Expression with second Buffer and second value is data pointer shift params for it * @return Returns True if they are adjacent, otherwise returns False */ - static bool are_adjacent(const std::pair& lhs, - const std::pair& rhs); + static bool are_adjacent(const std::pair& lhs, + const std::pair& rhs); /** * @brief Find all buffers that are connected to the current LoopEnd diff --git a/src/common/snippets/include/snippets/lowered/pass/solve_buffer_memory.hpp b/src/common/snippets/include/snippets/lowered/pass/solve_buffer_memory.hpp index 74f2994deec971..71b5f4ba6c6f96 100644 --- a/src/common/snippets/include/snippets/lowered/pass/solve_buffer_memory.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/solve_buffer_memory.hpp @@ -35,32 +35,33 @@ class SolveBufferMemory : public Pass { bool run(lowered::LinearIR& linear_ir) override; private: + using Buffers = std::vector; /** * @brief Split buffer expressions of Linear IR into * static (with defined allocation size) and dynamic (with unknown size) buffers * @param buffer_expressions buffer expressions * @return the pair of static and dynamic buffer expressions */ - std::pair extract_static_and_dynamic_buffers(const LinearIR::container& buffer_expressions); + std::pair extract_static_and_dynamic_buffers(const Buffers& buffer_expressions); /** * @brief Initializes boxes for MemorySolver * @param buffer_expressions buffer expressions * @param linear_ir linear ir * @return vector of boxes for MemorySolver */ - 
std::vector init_boxes(const LinearIR::container& buffer_expressions, const LinearIR& linear_ir); + std::vector init_boxes(const Buffers& buffer_expressions, const LinearIR& linear_ir); /** * @brief Calculate memory size and set offset to buffer with defined allocation size * @param static_buffer_expressions static buffer expressions * @param linear_ir linear ir */ - void solve_static_buffer_memory(const LinearIR::container& static_buffer_expressions, const LinearIR& linear_ir); + void solve_static_buffer_memory(const Buffers& static_buffer_expressions, const LinearIR& linear_ir); /** * @brief Initialize offset for Buffer with undefined allocation size * Note: should be called after `solve_static_buffer_memory` * @param dynamic_buffer_expressions dynamic buffer expressions */ - void set_dynamic_buffer_offset(const LinearIR::container& dynamic_buffer_expressions); + void set_dynamic_buffer_offset(const Buffers& dynamic_buffer_expressions); size_t& m_static_buffer_scratchpad_size; diff --git a/src/common/snippets/include/snippets/lowered/pass/validate_buffers.hpp b/src/common/snippets/include/snippets/lowered/pass/validate_buffers.hpp new file mode 100644 index 00000000000000..b87697d054e4fb --- /dev/null +++ b/src/common/snippets/include/snippets/lowered/pass/validate_buffers.hpp @@ -0,0 +1,29 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "pass.hpp" + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { + +/** + * @interface ValidateBuffers + * @brief The pass validates buffer expression in Linear IR state + * @ingroup snippets + */ +class ValidateBuffers : public RangedPass { +public: + OPENVINO_RTTI("ValidateBuffers", "Pass") + ValidateBuffers() = default; + bool run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) override; +}; + +} // namespace pass +} // namespace lowered +} // namespace snippets +} // namespace ov diff 
--git a/src/common/snippets/include/snippets/op/buffer.hpp b/src/common/snippets/include/snippets/op/buffer.hpp index e990a31d28b6c0..8236413fa74088 100644 --- a/src/common/snippets/include/snippets/op/buffer.hpp +++ b/src/common/snippets/include/snippets/op/buffer.hpp @@ -14,94 +14,98 @@ namespace op { /** * @interface Buffer - * @brief This is a base class for memory storage. - * Notes: - * - All buffers with the same reg_group in a graph have the same memory pointer. So if we have a few buffers, - * each the corresponding MemoryAccess op for Buffer should have offset for common memory pointer of this Buffer - * - Buffer should be a single consumer for operation output port - * @param m_allocation_size - memory size for allocation in bytes. Dynamic value means undefined size. - * @param m_offset - offset in common Buffer scratchpad - * @param m_reg_group - number of register group. The Buffers from the same register group will have the same GPR - * @param m_cluster_id - number of cluster. The Buffers from the same cluster shares memory between them and will have the same offset. + * @brief This is a class for memory storage. + * The buffers can have source (called as "IntermediateMemory") and can be without source (called as "NewMemory"). + * First one contains memory which was stored by source -> these buffers propagate output shape and element type from parents to output. + * Second one has passed `element_type` and `shape` by user - these attributes describe independent empty memory. + * The both behaviors are implemented via the corresponding classes which are derived from the class "Buffer::BaseImpl". + * It allows user to work with only the class "op::Buffer" - all needed logic is implemented in the field `m_impl`. 
* @ingroup snippets */ class Buffer : public ov::op::Op { public: OPENVINO_OP("Buffer", "SnippetsOpset"); Buffer() = default; - Buffer(const OutputVector& arguments, size_t allocation_size = utils::get_dynamic_value(), size_t reg_group = 0, size_t cluster_id = 0); + Buffer(const ov::Output& arg); + Buffer(const OutputVector& arguments); + Buffer(const ov::Shape& shape, ov::element::Type element_type = ov::element::u8); bool visit_attributes(AttributeVisitor& visitor) override; - size_t get_reg_group() const { return m_reg_group; } - size_t get_cluster_id() const { return m_cluster_id; } - size_t get_offset() const { return m_offset; } - size_t get_allocation_size() const { return m_allocation_size; } - size_t get_byte_size() const; - - void set_reg_group(size_t reg_group) { m_reg_group = reg_group; } - void set_cluster_id(size_t cluster) { m_cluster_id = cluster; } - void set_allocation_size(size_t allocation_size) { m_allocation_size = allocation_size; } - void set_offset(size_t offset) { m_offset = offset; } - - // Returns True, if allocation size is known. Otherwise returns False - allocation size is undefined - bool is_defined() const; - -protected: - size_t m_allocation_size = utils::get_dynamic_value(); - size_t m_reg_group = 0; - size_t m_cluster_id = 0; - size_t m_offset = utils::get_dynamic_value(); -}; - -/** - * @interface IntermediateMemoryBuffer - * @brief Represents an intermediate memory storage operation. It always has a parent. 
- * @ingroup snippets - * - */ -class IntermediateMemoryBuffer : public Buffer { -public: - OPENVINO_OP("IntermediateMemoryBuffer", "SnippetsOpset", Buffer); - IntermediateMemoryBuffer() = default; - IntermediateMemoryBuffer(const OutputVector& arguments, size_t allocation_size = utils::get_dynamic_value(), - size_t reg_group = 0, size_t cluster_id = 0); - IntermediateMemoryBuffer(const ov::Output& arg, size_t allocation_size = utils::get_dynamic_value(), - size_t reg_group = 0, size_t cluster_id = 0); - void validate_and_infer_types() override; - std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; -}; -/** - * @interface NewMemoryBuffer - * @brief Represents a new empty memory for allocation with specified shape. It has no parent operations. - * @ingroup snippets - * - */ -class NewMemoryBuffer : public Buffer { -public: - OPENVINO_OP("NewMemoryBuffer", "SnippetsOpset", Buffer); - NewMemoryBuffer() = default; - NewMemoryBuffer(const ov::Shape& shape, size_t reg_group = 0, size_t cluster_id = 0, ov::element::Type element_type = ov::element::u8); - - void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; - void set_element_type(ov::element::Type element_type); + size_t get_allocation_size() const { return m_impl->get_allocation_size(); } class ShapeInfer : public IShapeInferSnippets { - ov::Shape m_shape; + std::shared_ptr m_impl_shape_infer {nullptr}; public: explicit ShapeInfer(const std::shared_ptr& n); Result infer(const std::vector& input_shapes) override; }; private: - ov::Shape m_output_shape; - ov::element::Type m_element_type = ov::element::u8; // u8 - default 1 byte + // Base class for implementations of Buffer + class BaseImpl { + public: + BaseImpl() = default; + virtual ~BaseImpl() = default; + virtual size_t get_allocation_size() const = 0; + virtual std::shared_ptr clone() const = 0; + virtual void validate_and_infer_types(Buffer* buffer) const = 0; 
+ virtual bool visit_attributes(AttributeVisitor& visitor) = 0; + virtual std::shared_ptr get_shape_infer() const = 0; + }; + + // IntermediateMemoryImpl represents intermediate memory. + // The buffers with this implementation must have source (parents) + class IntermediateMemoryImpl : public BaseImpl { + public: + IntermediateMemoryImpl() = default; + + size_t get_allocation_size() const override { return utils::get_dynamic_value(); } + std::shared_ptr clone() const override; + void validate_and_infer_types(Buffer* buffer) const override; + bool visit_attributes(AttributeVisitor& visitor) override { return true; } + std::shared_ptr get_shape_infer() const override { return std::make_shared(); } + private: + class ShapeInfer : public IShapeInferSnippets { + public: + Result infer(const std::vector& input_shapes) override; + }; + }; + + // NewMemoryImpl represents a new empty memory for allocation with specified shape and element type. + // The buffers with this implementation mustn't have source (parents) + class NewMemoryImpl : public BaseImpl { + public: + NewMemoryImpl(const ov::Shape& shape, ov::element::Type element_type); + + size_t get_allocation_size() const override; + std::shared_ptr clone() const override; + void validate_and_infer_types(Buffer* buffer) const override; + bool visit_attributes(AttributeVisitor& visitor) override; + std::shared_ptr get_shape_infer() const override { return std::make_shared(m_shape); } + private: + class ShapeInfer : public IShapeInferSnippets { + ov::Shape m_shape; + public: + explicit ShapeInfer(ov::Shape shape); + Result infer(const std::vector& input_shapes) override; + }; + + ov::Shape m_shape; + ov::element::Type m_element_type = ov::element::u8; // u8 - default 1 byte + }; + + // This constructor is used only in clone_with_new_inputs + Buffer(const OutputVector& arguments, std::shared_ptr impl); + + const std::shared_ptr m_impl {nullptr}; }; + } // namespace op } // namespace snippets } // namespace ov diff --git 
a/src/common/snippets/include/snippets/runtime_configurator.hpp b/src/common/snippets/include/snippets/runtime_configurator.hpp index e782d130a82dca..a2c605fd8b5356 100644 --- a/src/common/snippets/include/snippets/runtime_configurator.hpp +++ b/src/common/snippets/include/snippets/runtime_configurator.hpp @@ -201,7 +201,7 @@ class RuntimeConfigurator { std::vector m_io_descs = {}; std::vector m_io_data_sizes = {}; // [cluster_id -> buffer expressions ] - std::map> m_dynamic_buffer_clusters = {}; + std::map> m_dynamic_buffer_clusters = {}; std::vector m_latest_shapes = {}; }; diff --git a/src/common/snippets/src/generator.cpp b/src/common/snippets/src/generator.cpp index 7ba5e830fd3362..d059ddd94d5724 100644 --- a/src/common/snippets/src/generator.cpp +++ b/src/common/snippets/src/generator.cpp @@ -29,7 +29,7 @@ LoweringResult Generator::generate(const lowered::LinearIRPtr& linear_ir, const const auto kernel_op = op::Kernel::make_kernel(*linear_ir); kernel_op->compile_params = compile_params; - const auto kernel_expr = linear_ir->create_expression(kernel_op, std::vector{}); + const auto kernel_expr = linear_ir->get_expr_factory()->build(kernel_op, std::vector{}); const auto kernel = target->get(kernel_expr->get_node()->get_type_info())(kernel_expr); kernel->emit_code({}, {}); @@ -74,8 +74,7 @@ RegType Generator::get_op_out_reg_type(const ov::Output& out) const { std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) || - std::dynamic_pointer_cast(op) || - std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) diff --git a/src/common/snippets/src/lowered/expression.cpp b/src/common/snippets/src/lowered/expression.cpp index 3c4391da3a7250..aaa71612cef706 100644 --- a/src/common/snippets/src/lowered/expression.cpp +++ b/src/common/snippets/src/lowered/expression.cpp @@ -27,23 +27,6 @@ Expression::Expression(const 
std::shared_ptr& n, const std::shared_ptr(other), m_source_node(other.m_source_node), m_emitter(other.m_emitter), - m_loop_ids(other.m_loop_ids), m_shapeInference(other.m_shapeInference), m_need_shape_infer(other.m_need_shape_infer), - m_exec_num(other.m_exec_num) { - auto clone_ports_descriptors = [](const std::vector& src, std::vector& dst) { - dst.resize(src.size()); - for (size_t i = 0; i < src.size(); i++) - dst[i] = src[i]->clone(); - }; - clone_ports_descriptors(other.m_input_port_descriptors, m_input_port_descriptors); - clone_ports_descriptors(other.m_output_port_descriptors, m_output_port_descriptors); - // Note that connectors are not filled on purpose, since you need a shared pointer to this to initialize them, - // which is not available in constructor. Also, an expression copy is rarely expected to use the same connectors. - m_input_port_connectors = {}; - m_output_port_connectors = {}; -} - const PortConnectorPtr& Expression::get_input_port_connector(size_t i) const { OPENVINO_ASSERT(i < m_input_port_connectors.size(), "Failed to get input port connector: target input port must be less than input count!"); return m_input_port_connectors[i]; @@ -97,12 +80,12 @@ void Expression::set_reg_info(const RegInfo& rinfo) { } void Expression::validate() const { + OPENVINO_ASSERT(m_source_node != nullptr, + "The expression has null source node"); OPENVINO_ASSERT(m_input_port_descriptors.size() == m_input_port_connectors.size(), "The count of input ports and input port connectors must be equal"); OPENVINO_ASSERT(m_output_port_descriptors.size() == m_output_port_connectors.size(), "The count of output ports and output port connectors must be equal"); - OPENVINO_ASSERT(m_source_node != nullptr, - "The expression has null source node"); } void Expression::set_input_port_connector(size_t port, PortConnectorPtr to) { @@ -130,31 +113,39 @@ void Expression::set_loop_ids(const std::vector& loops) { m_loop_ids = loops; } -void Expression::update_node_and_connectors(const 
std::vector& new_inputs, - const std::shared_ptr& new_node) { +ExpressionPtr Expression::clone_with_new_inputs(const std::shared_ptr& new_node, + const std::vector& new_inputs, + const std::vector& new_in_descs) const { + auto clone_ports_descriptors = [](const std::vector& src) { + std::vector dst(src.size()); + for (size_t i = 0; i < src.size(); i++) + dst[i] = src[i]->clone(); + return dst; + }; + const auto& cloned = clone(); OPENVINO_ASSERT(m_source_node->get_type_info() == new_node->get_type_info(), "Can't clone expression for a new node with incompatible type"); - m_source_node = new_node; - OPENVINO_ASSERT(new_inputs.size() == m_input_port_descriptors.size(), + cloned->m_source_node = new_node; + + // Initialize Port Attributes: PortConnectors and PortDescriptors + OPENVINO_ASSERT(new_in_descs.empty() || new_inputs.size() == new_in_descs.size(), "Can't create Expression with new inputs: invalid number of input port connectors passed"); - m_input_port_connectors = new_inputs; - for (size_t i = 0; i < m_input_port_descriptors.size(); i++) { - const auto& i_con = new_inputs[i]; - const auto& i_port = get_input_port(i); + cloned->m_input_port_descriptors = !new_in_descs.empty() ? 
new_in_descs : clone_ports_descriptors(m_input_port_descriptors); + cloned->m_input_port_connectors = new_inputs; + for (size_t i = 0; i < cloned->m_input_port_connectors.size(); i++) { + const auto& i_con = cloned->m_input_port_connectors[i]; + const auto& i_port = cloned->get_input_port(i); if (!i_con->found_consumer(i_port)) i_con->add_consumer(i_port); } - m_output_port_connectors.resize(m_output_port_descriptors.size()); - for (size_t i = 0; i < m_output_port_descriptors.size(); i++) { - m_output_port_connectors[i] = std::make_shared(get_output_port(i)); - } -} + cloned->m_output_port_descriptors = clone_ports_descriptors(m_output_port_descriptors); + OPENVINO_ASSERT(cloned->m_output_port_connectors.size() == cloned->m_output_port_descriptors.size(), + "Can't create Expression with new inputs: output port attributes are not compatible"); + for (size_t i = 0; i < cloned->m_output_port_descriptors.size(); i++) + cloned->m_output_port_connectors[i] = std::make_shared(cloned->get_output_port(i)); -ExpressionPtr Expression::clone_with_new_inputs(const std::vector& new_inputs, - const std::shared_ptr& new_node) const { - const auto& expr = std::shared_ptr(new Expression(*this)); - expr->update_node_and_connectors(new_inputs, new_node); - return expr; + cloned->validate(); + return cloned; } ExpressionPtr Expression::clone_with_new_inputs(const ExpressionMap& expr_map, @@ -171,7 +162,89 @@ ExpressionPtr Expression::clone_with_new_inputs(const ExpressionMap& expr_map, new_inputs.emplace_back(input); } } - return clone_with_new_inputs(new_inputs, new_node); + return clone_with_new_inputs(new_node, new_inputs); +} + +ExpressionPtr Expression::clone() const { + return std::shared_ptr(new Expression(*this)); +} + +bool Expression::visit_attributes(AttributeVisitor &visitor) { + auto is_planar_layout = [](const std::vector& layout) { + for (size_t i = 0; i < layout.size(); ++i) + if (layout[i] != i) return false; + return true; + }; + auto subtensor2str = [](const 
VectorDims& subtensor) { + std::stringstream ss; + for (size_t i = 0; i < subtensor.size(); ++i) { + const auto& v = subtensor[i]; + const auto v_str = utils::is_full_dim_value(v) ? "FULL_DIM" : + utils::is_dynamic_value(v) ? "?" : std::to_string(v); + const auto del = i < subtensor.size() - 1 ? ", " : ""; + ss << v_str << del; + } + return ss.str(); + }; + + std::vector in_regs, out_regs; + std::vector in_reg_types, out_reg_types; + std::vector> shapes; + std::vector> subtensors; + std::vector>> layouts; + for (size_t i = 0; i < get_input_count(); i++) { + const auto& desc = m_input_port_descriptors[i]; + const auto& shape = desc->get_shape(); + if (!shape.empty()) + shapes.emplace_back("in_shape_" + std::to_string(i), ov::PartialShape(shape)); + + const auto& subtensor = desc->get_subtensor(); + if (!subtensor.empty()) + subtensors.emplace_back("in_subtensor_" + std::to_string(i), subtensor2str(subtensor)); + + const auto& layout = desc->get_layout(); + if (!layout.empty() && !is_planar_layout(layout)) + layouts.emplace_back("in_layout_" + std::to_string(i), layout); + + in_reg_types.emplace_back(regTypeToStr(desc->get_reg().type)); + in_regs.emplace_back(desc->get_reg().idx); + } + for (size_t i = 0; i < get_output_count(); i++) { + const auto& desc = m_output_port_descriptors[i]; + const auto& shape = desc->get_shape(); + if (!shape.empty()) + shapes.emplace_back("out_shape_" + std::to_string(i), ov::PartialShape(shape)); + + const auto& subtensor = desc->get_subtensor(); + if (!subtensor.empty()) + subtensors.emplace_back("out_subtensor_" + std::to_string(i), subtensor2str(subtensor)); + + const auto& layout = desc->get_layout(); + if (!layout.empty() && !is_planar_layout(layout)) + layouts.emplace_back("out_layout_" + std::to_string(i), layout); + + out_reg_types.emplace_back(regTypeToStr(desc->get_reg().type)); + out_regs.emplace_back(desc->get_reg().idx); + } + + if (!in_regs.empty()) { + visitor.on_attribute("in_regs", in_regs); + 
visitor.on_attribute("in_reg_types", in_reg_types); + } + if (!out_regs.empty()) { + visitor.on_attribute("out_regs", out_regs); + visitor.on_attribute("out_reg_types", out_reg_types); + } + for (auto& s : shapes) + visitor.on_attribute(s.first, s.second); + for (auto& s : subtensors) + visitor.on_attribute(s.first, s.second); + for (auto& s : layouts) + visitor.on_attribute(s.first, s.second); + visitor.on_attribute("loop_ids", m_loop_ids); + visitor.on_attribute("execution_number", m_exec_num); + m_source_node->visit_attributes(visitor); + return true; } ExpressionPort Expression::get_input_port(size_t i) { diff --git a/src/common/snippets/src/lowered/expression_factory.cpp b/src/common/snippets/src/lowered/expression_factory.cpp index da60f9ac701b5f..668df3b65c415e 100644 --- a/src/common/snippets/src/lowered/expression_factory.cpp +++ b/src/common/snippets/src/lowered/expression_factory.cpp @@ -10,22 +10,29 @@ namespace ov { namespace snippets { namespace lowered { -void LinearIR::ExpressionFactory::create_expression_inputs(const LinearIR& linear_ir, const ExpressionPtr& expr) { - OPENVINO_ASSERT(expr != nullptr, "Failed expression inputs creation: expression is null"); - const auto& node = expr->get_node(); - - expr->m_input_port_connectors.resize(node->get_input_size(), nullptr); - for (const auto& input : node->inputs()) { - const auto input_source = input.get_source_output(); - const auto in_index = input.get_index(); - const auto& parent_expr = linear_ir.get_expr_by_node(input_source.get_node_shared_ptr()); - const auto& port_connector = parent_expr->get_output_port_connector(input_source.get_index()); - port_connector->add_consumer(expr->get_input_port(in_index)); - expr->m_input_port_connectors[in_index] = port_connector; +template<> +std::shared_ptr ExpressionFactory::build(const std::shared_ptr& n, const std::vector& inputs) { + if (const auto par = ov::as_type_ptr(n)) { + return create(par, inputs, m_shape_infer_factory); + } else if (const auto res = 
ov::as_type_ptr(n)) { + return create(res, inputs, m_shape_infer_factory); + } else if (const auto loop_begin = ov::as_type_ptr(n)) { + return create(loop_begin, inputs, m_shape_infer_factory); + } else if (const auto loop_end = ov::as_type_ptr(n)) { + return create(loop_end, inputs, m_shape_infer_factory); + } else if (const auto buffer = ov::as_type_ptr(n)) { + return create(buffer, inputs, m_shape_infer_factory); +#ifdef SNIPPETS_DEBUG_CAPS + } else if (const auto perf_counter = ov::as_type_ptr(n)) { + return create(perf_counter, inputs, m_shape_infer_factory); + } else if (const auto perf_counter = ov::as_type_ptr(n)) { + return create(perf_counter, inputs, m_shape_infer_factory); +#endif } + return create(n, inputs, m_shape_infer_factory); } -void LinearIR::ExpressionFactory::create_expression_outputs(const ExpressionPtr& expr) { +void ExpressionFactory::create_expression_outputs(const ExpressionPtr& expr) { OPENVINO_ASSERT(expr != nullptr, "Failed expression outputs creation: expression is null"); const auto& node = expr->get_node(); @@ -38,7 +45,7 @@ void LinearIR::ExpressionFactory::create_expression_outputs(const ExpressionPtr& } // The method verifies of input port connectors to availability of the expression as consumer and add it if missed -void LinearIR::ExpressionFactory::init_expression_inputs(const ExpressionPtr& expr, const std::vector& inputs) { +void ExpressionFactory::init_expression_inputs(const ExpressionPtr& expr, const std::vector& inputs) { for (size_t i = 0; i < inputs.size(); ++i) { const auto& input = inputs[i]; const auto consumers = input->get_consumers(); @@ -53,18 +60,21 @@ void LinearIR::ExpressionFactory::init_expression_inputs(const ExpressionPtr& ex expr->m_input_port_connectors = inputs; } -ExpressionPtr LinearIR::ExpressionFactory::create(const std::shared_ptr& par, const LinearIR& linear_ir) { +ExpressionPtr ExpressionFactory::create(const std::shared_ptr& par, const std::vector& inputs, + const std::shared_ptr& 
shape_infer_factory) { + OPENVINO_ASSERT(inputs.empty(), "Parameter cannot have inputs"); // Note: ctor of shared_ptr isn't friend class for Expression -> we cannot use directly make_shared(args) - auto expr = std::shared_ptr(new Expression(par, linear_ir.m_shape_infer_factory, false)); + auto expr = std::shared_ptr(new Expression(par, shape_infer_factory, false)); create_expression_outputs(expr); expr->validate(); return expr; } -ExpressionPtr LinearIR::ExpressionFactory::create(const std::shared_ptr& res, const LinearIR& linear_ir) { +ExpressionPtr ExpressionFactory::create(const std::shared_ptr& res, const std::vector& inputs, + const std::shared_ptr& shape_infer_factory) { // Note: ctor of shared_ptr isn't friend class for Expression -> we cannot use directly make_shared(args) - auto expr = std::shared_ptr(new Expression(res, linear_ir.m_shape_infer_factory)); - create_expression_inputs(linear_ir, expr); + auto expr = std::shared_ptr(new Expression(res, shape_infer_factory)); + init_expression_inputs(expr, inputs); // The Result node don't need output port (because of sense of the node). But each node in openvino must have one output at least. // The port descriptors are automatically created in constructor. We manually clean output ports. 
expr->m_output_port_descriptors.clear(); @@ -72,31 +82,19 @@ ExpressionPtr LinearIR::ExpressionFactory::create(const std::shared_ptr& n, const LinearIR& linear_ir) { - OPENVINO_ASSERT(!ov::is_type(n), "Default expression builder doesn't support LoopBegin and LoopEnd"); - // Note: ctor of shared_ptr isn't friend class for Expression - auto expr = std::shared_ptr(new Expression(n, linear_ir.m_shape_infer_factory)); - create_expression_inputs(linear_ir, expr); - create_expression_outputs(expr); - expr->validate(); - return expr; -} - -ExpressionPtr LinearIR::ExpressionFactory::create(const std::shared_ptr& n, - const std::vector& inputs, - const LinearIR& linear_ir) { +ExpressionPtr ExpressionFactory::create(const std::shared_ptr& n, const std::vector& inputs, + const std::shared_ptr& shape_infer_factory) { OPENVINO_ASSERT(inputs.empty(), "LoopBegin cannot have inputs"); - auto expr = std::shared_ptr(new Expression(n, linear_ir.m_shape_infer_factory, false)); + auto expr = std::shared_ptr(new Expression(n, shape_infer_factory, false)); init_expression_inputs(expr, inputs); create_expression_outputs(expr); expr->validate(); return expr; } -ExpressionPtr LinearIR::ExpressionFactory::create(const std::shared_ptr& n, - const std::vector& inputs, - const LinearIR& linear_ir) { - auto expr = std::shared_ptr(new Expression(n, linear_ir.m_shape_infer_factory, false)); +ExpressionPtr ExpressionFactory::create(const std::shared_ptr& n, const std::vector& inputs, + const std::shared_ptr& shape_infer_factory) { + auto expr = std::shared_ptr(new Expression(n, shape_infer_factory, false)); expr->m_input_port_descriptors.resize(inputs.size(), nullptr); for (size_t i = 0; i < inputs.size() - 1; ++i) { expr->m_input_port_descriptors[i] = std::make_shared(); @@ -113,23 +111,22 @@ ExpressionPtr LinearIR::ExpressionFactory::create(const std::shared_ptr& n, - const std::vector& inputs, - const LinearIR& linear_ir) { - OPENVINO_ASSERT(inputs.empty(), "PerfCountBegin factory do not accept 
any input connectors"); - return create_without_connections(n, linear_ir); +ExpressionPtr ExpressionFactory::create(const std::shared_ptr& n, const std::vector& inputs, + const std::shared_ptr& shape_infer_factory) { + OPENVINO_ASSERT(inputs.empty(), "PerfCountBegin shape_infer_factory do not accept any input connectors"); + return create_without_connections(n, shape_infer_factory); } -ExpressionPtr LinearIR::ExpressionFactory::create(const std::shared_ptr& n, - const std::vector& inputs, - const LinearIR& linear_ir) { - OPENVINO_ASSERT(inputs.empty(), "PerfCountEnd factory do not accept any input connectors"); - return create_without_connections(n, linear_ir); +ExpressionPtr ExpressionFactory::create(const std::shared_ptr& n, + const std::vector& inputs, + const std::shared_ptr& shape_infer_factory) { + OPENVINO_ASSERT(inputs.empty(), "PerfCountEnd shape_infer_factory do not accept any input connectors"); + return create_without_connections(n, shape_infer_factory); } -ExpressionPtr LinearIR::ExpressionFactory::create_without_connections(const std::shared_ptr& n, - const LinearIR& linear_ir) { - auto expr = std::shared_ptr(new Expression(n, linear_ir.m_shape_infer_factory, false)); +ExpressionPtr ExpressionFactory::create_without_connections(const std::shared_ptr& n, + const std::shared_ptr& shape_infer_factory) { + auto expr = std::shared_ptr(new Expression(n, shape_infer_factory, false)); expr->m_input_port_descriptors.clear(); expr->m_output_port_descriptors.clear(); expr->validate(); @@ -137,22 +134,6 @@ ExpressionPtr LinearIR::ExpressionFactory::create_without_connections(const std: } #endif -ExpressionPtr LinearIR::ExpressionFactory::create(const std::shared_ptr& n, - const std::vector& inputs, - const LinearIR& linear_ir) { - OPENVINO_ASSERT(!ov::is_type(n) && - !ov::is_type(n), - "Expression builder with inputs doesn't support Result and Parameter"); - auto expr = std::shared_ptr(new Expression(n, linear_ir.m_shape_infer_factory)); - 
init_expression_inputs(expr, inputs); - create_expression_outputs(expr); - expr->validate(); - // todo: here we blindly synchronize input shapes from parent and child. Remove this when shapes will be stored in - // port connector itself - if (linear_ir.m_shape_infer_factory) - expr->updateShapes(); - return expr; -} }// namespace lowered }// namespace snippets }// namespace ov diff --git a/src/common/snippets/src/lowered/expressions/buffer_expression.cpp b/src/common/snippets/src/lowered/expressions/buffer_expression.cpp new file mode 100644 index 00000000000000..acc742ff196407 --- /dev/null +++ b/src/common/snippets/src/lowered/expressions/buffer_expression.cpp @@ -0,0 +1,134 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + + +#include "snippets/lowered/expressions/buffer_expression.hpp" + +#include "snippets/lowered/loop_manager.hpp" +#include "snippets/op/buffer.hpp" + + +namespace ov { +namespace snippets { +namespace lowered { + +BufferExpression::BufferExpression(const std::shared_ptr& n, const std::shared_ptr& factory) + : Expression(n, factory) { + const auto& buffer = ov::as_type_ptr(get_node()); + OPENVINO_ASSERT(buffer, "BufferExpression expects Buffer op"); + m_allocation_size = buffer->get_allocation_size(); +} + +ExpressionPtr BufferExpression::clone() const { + return std::shared_ptr(new BufferExpression(*this)); +} + +bool BufferExpression::visit_attributes(AttributeVisitor &visitor) { + auto allocation_size = utils::value2str(m_allocation_size); + auto offset = utils::value2str(m_offset); + visitor.on_attribute("allocation_size", allocation_size); + visitor.on_attribute("offset", offset); + visitor.on_attribute("reg_group", m_reg_group); + visitor.on_attribute("cluster_id", m_cluster_id); + return true; +} + +bool BufferExpression::is_defined() const { + return !utils::is_dynamic_value(m_allocation_size); +} + +size_t BufferExpression::get_byte_size() const { + if (is_defined()) + return m_allocation_size * 
get_node()->get_output_element_type(0).size(); + return utils::get_dynamic_value(); +} + +namespace { +std::vector get_parent_inner_loops(const std::vector& parent_loops, const std::vector& current_loops) { + const auto common_rank = std::min(parent_loops.size(), current_loops.size()); + size_t i = 0; + while (i < common_rank && parent_loops[i] == current_loops[i]) + ++i; + return std::vector(parent_loops.cbegin() + i, parent_loops.cend()); +} +} // namespace + +// Ticket: 113744 +// TODO: This logic covers only several specific cases so it should be generalized. +void BufferExpression::init_allocation_size(const std::shared_ptr& loop_manager, size_t allocation_rank) { + // Note: Buffer expressions can have more than one parent after the loops splitting transformation, but only the last parent + // can be used to access valid loop ports. More info in the ticket: 146646 + const auto buffer_in_idx = get_input_count() - 1; + const auto& parent_port = get_input_port_connector(buffer_in_idx)->get_source(); + const auto& parent_loop_ids = get_parent_inner_loops(parent_port.get_expr()->get_loop_ids(), get_loop_ids()); + const auto planar_shape = utils::get_preordered_vdims(parent_port); + + const size_t rank = allocation_rank >= 0 ? 
std::min(static_cast(allocation_rank), planar_shape.size()) + : planar_shape.size(); + + const auto& subtensor = ov::snippets::utils::get_projected_subtensor(parent_port); + + auto hard_equal = [&parent_port](const LoopPort& port) { + return *port.expr_port == parent_port; + }; + auto soft_equal = [&](const LoopPort& loop_port) { + const auto& port = *loop_port.expr_port; + // Check semantic of LoopPort + if (parent_port.get_index() != port.get_index() || + port.get_expr()->get_node()->get_type_info() != parent_port.get_expr()->get_node()->get_type_info()) + return false; + // Check that this LoopPort is connected to the same by semantic Buffer + const auto consumers = port.get_connected_ports(); + for (const auto& consumer : consumers) { + if (const auto buffer_consumer = ov::as_type_ptr(consumer.get_expr())) { + if (buffer_consumer->get_cluster_id() == m_cluster_id && consumer.get_index() == buffer_in_idx) + return true; + } + } + return false; + }; + + m_allocation_size = 1; + std::set processed_dim_idxs; + for (const auto& parent_loop : parent_loop_ids) { + const auto loop_info = loop_manager->get_loop_info(parent_loop); + const auto& output_ports = loop_info->get_output_ports(); + auto it = std::find_if(output_ports.begin(), output_ports.end(), hard_equal); + // [149219] : Try to find original loop port if this LoopInfo is cloned after InsertSpecificIterations + // and ports are not mapped on the original ExpressionPorts + if (it == output_ports.end()) { + it = std::find_if(output_ports.begin(), output_ports.end(), soft_equal); + OPENVINO_ASSERT(it != output_ports.end(), "compute_allocation_shape: output port of parent loop can not be found"); + } + const auto& loop_port = *it; + const auto& dim_idx = loop_port.dim_idx; + if (loop_port.is_incremented && dim_idx < rank) { + if (const auto& unified_loop_info = ov::as_type_ptr(loop_info)) + m_allocation_size = utils::dynamic_safe_mul(m_allocation_size, unified_loop_info->get_work_amount()); + else if (const auto& 
expanded_loop_info = ov::as_type_ptr(loop_info)) + m_allocation_size = utils::dynamic_safe_mul(m_allocation_size, expanded_loop_info->get_unified_loop_info()->get_work_amount()); + else + OPENVINO_THROW("Unknown LoopInfo type"); + processed_dim_idxs.insert(dim_idx); + } + } + const auto processing_rank = !processed_dim_idxs.empty() ? std::max(*processed_dim_idxs.rbegin(), subtensor.size()) : subtensor.size(); + for (size_t i = 0; i < std::min(processing_rank, rank); ++i) { + if (processed_dim_idxs.count(i) == 0) { + const auto multiplier = i < subtensor.size() ? *(subtensor.rbegin() + i) : *(planar_shape.rbegin() + i); + m_allocation_size = utils::dynamic_safe_mul(m_allocation_size, multiplier); + } + } + + // Corner case when the current information is not enough + if (processing_rank == 0 && processed_dim_idxs.empty()) { + for (size_t i = 0; i < rank; ++i) { + m_allocation_size = utils::dynamic_safe_mul(m_allocation_size, *(planar_shape.rbegin() + i)); + } + } +} + +} // namespace lowered +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/src/lowered/linear_ir.cpp b/src/common/snippets/src/lowered/linear_ir.cpp index 09640196b1fa17..36ab2e235880af 100644 --- a/src/common/snippets/src/lowered/linear_ir.cpp +++ b/src/common/snippets/src/lowered/linear_ir.cpp @@ -25,7 +25,8 @@ LinearIR::LinearIR(Config config, const std::shared_ptr()), m_shape_infer_factory(factory), - m_shape_infer(std::make_shared(m_expressions, m_parameter_expressions, m_result_expressions)) {} + m_shape_infer(std::make_shared(m_expressions, m_parameter_expressions, m_result_expressions)), + m_expression_factory(std::make_shared(m_shape_infer_factory)) {} LinearIR::LinearIR(const std::shared_ptr& model, const std::shared_ptr& factory, @@ -34,7 +35,7 @@ LinearIR::LinearIR(const std::shared_ptr& model, constExprIt last_param = m_expressions.end(); for (const auto& n : get_ordered_ops(model)) { constExprIt insertion_pos = m_expressions.end(); - const auto expr = 
create_expression(n); + const auto expr = get_expr_factory()->build(n, get_expression_inputs_by_node(n)); // Scalar should be on the Linear IR beginning after Parameters to have valid expression order after Loop passes. // After these passes we must call pass MoveScalarToConsumer() to have a correct accuracy. @@ -43,7 +44,6 @@ LinearIR::LinearIR(const std::shared_ptr& model, insertion_pos = std::next(last_param); } - // exec_num = 0 since `insertion_pos` can be changed register_expression(expr, true, 0); const auto& it = m_expressions.insert(insertion_pos, expr); if (ov::is_type(n)) @@ -57,12 +57,21 @@ LinearIR::LinearIR(const std::shared_ptr& model, enumerate_expressions(); } -ExpressionPtr LinearIR::create_expression(const std::shared_ptr& n) { - return ExpressionFactory::build(n, *this); +const ExpressionFactoryPtr& LinearIR::get_expr_factory() const { + OPENVINO_ASSERT(m_expression_factory, "ExpresstionFactory is missed!"); + return m_expression_factory; } -ExpressionPtr LinearIR::create_expression(const std::shared_ptr& n, const std::vector& inputs) const { - return ExpressionFactory::build(n, inputs, *this); +std::vector LinearIR::get_expression_inputs_by_node(const std::shared_ptr& n) const { + OPENVINO_ASSERT(n != nullptr, "Failed expression inputs getting: node is null"); + std::vector inputs(n->get_input_size(), nullptr); + for (const auto& input : n->inputs()) { + const auto input_source = input.get_source_output(); + const auto in_index = input.get_index(); + const auto& parent_expr = get_expr_by_node(input_source.get_node_shared_ptr()); + inputs[in_index] = parent_expr->get_output_port_connector(input_source.get_index()); + } + return inputs; } namespace { @@ -84,7 +93,7 @@ void update_consumers_and_regs(const ExpressionPtr& new_expr, const std::vector< ExpressionPtr LinearIR::create_expression(const std::shared_ptr& n, const std::vector& new_inputs, const std::vector& loop_ids, bool update_loop_ports, const std::vector>& consumers) { - const auto 
new_expr = create_expression(n, new_inputs); + const auto new_expr = get_expr_factory()->build(n, new_inputs); update_consumers_and_regs(new_expr, consumers); new_expr->set_loop_ids(loop_ids); @@ -178,12 +187,13 @@ void LinearIR::register_expression(const ExpressionPtr& expr, bool io_allowed, d "LinearIR::insert can't be used to add Parameters or Results to IR"); const auto& res = m_node2expression_map.insert({node, expr}); OPENVINO_ASSERT(res.second, "Duplicate node is detected in linear IR: ", node); + if (ov::is_type(node)) m_parameter_expressions.push_back(expr); if (ov::is_type(node)) m_result_expressions.push_back(expr); - if (ov::is_type(node)) - m_buffer_expressions.push_back(expr); + if (const auto buffer_expr = ov::as_type_ptr(expr)) + m_buffer_expressions.push_back(buffer_expr); expr->m_exec_num = exec_num; } @@ -197,9 +207,9 @@ void LinearIR::unregister_expression(const ExpressionPtr& expr) { m_node2expression_map.erase(node); OPENVINO_ASSERT(!ov::is_type(node) && !ov::is_type(node), "unregister_expression mustn't be called for parameter or result expressions"); - if (ov::is_type(node)) { - const auto& it = std::find(m_buffer_expressions.cbegin(), m_buffer_expressions.cend(), expr); - OPENVINO_ASSERT(it != m_buffer_expressions.cend(), "Buffer Expression has not been found in the list of LinearIR Buffers!"); + if (const auto buffer_expr = ov::as_type_ptr(expr)) { + const auto& it = std::find(m_buffer_expressions.cbegin(), m_buffer_expressions.cend(), buffer_expr); + OPENVINO_ASSERT(it != m_buffer_expressions.cend(), "BufferExpression has not been found in the list of LinearIR Buffers!"); m_buffer_expressions.erase(it); } } @@ -245,7 +255,7 @@ LinearIR::exprIt LinearIR::insert(LinearIR::constExprIt pos, const NodeVector& n } LinearIR::exprIt LinearIR::insert(LinearIR::constExprIt pos, const std::shared_ptr& n) { - const auto& expr = create_expression(n); + const auto& expr = get_expr_factory()->build(n, get_expression_inputs_by_node(n)); 
register_expression(expr, m_config.m_manual_build_support, get_inserted_expr_exec_num(pos)); return m_expressions.insert(pos, expr); } @@ -338,6 +348,18 @@ LinearIR::exprIt LinearIR::insert_node(const std::shared_ptr& new_node return insert_node(new_node, new_inputs, loop_ids, update_loop_ports, place, consumers); } +LinearIR::exprIt LinearIR::insert_expr(const ExpressionPtr& new_expr, const std::vector& loop_ids, + bool update_loop_ports, const constExprIt& place, const std::vector>& consumers) { + update_consumers_and_regs(new_expr, consumers); + new_expr->set_loop_ids(loop_ids); + + const auto expr_it = insert(place, new_expr); + if (update_loop_ports) + get_loop_manager()->update_loop_ports(new_expr); + + return expr_it; +} + LinearIR::exprIt LinearIR::replace_with_node(const std::vector& old_exprs, const std::shared_ptr& new_node, const std::vector& loop_ids, const constExprIt& place) { OPENVINO_ASSERT(!old_exprs.empty(), "Failed to replace node: there are no old expressions for replacing"); @@ -473,11 +495,10 @@ double LinearIR::get_inserted_expr_exec_num(constExprIt insertion_pos) const { return left_order + (right_order - left_order) / 2; } -LinearIR::LIRShapeInfer::LIRShapeInfer(const container& body_exprs, const container& param_exprs, const container& result_exprs) - : ShapeInferSnippetsNode(), - m_exprs(body_exprs), - m_input_exprs(param_exprs), - m_output_exprs(result_exprs) { +LinearIR::LIRShapeInfer::LIRShapeInfer(const container& body_exprs, + const std::vector& param_exprs, + const std::vector& result_exprs) + : ShapeInferSnippetsNode(), m_exprs(body_exprs), m_input_exprs(param_exprs), m_output_exprs(result_exprs) { // Note that if all output shapes are static, as in the case when the first shape infer was performed on nGraph, // we can treat them as the last result std::vector outputDims; diff --git a/src/common/snippets/src/lowered/pass/allocate_buffers.cpp b/src/common/snippets/src/lowered/pass/allocate_buffers.cpp index 
d72e35ceac533b..f76c4097b38f38 100644 --- a/src/common/snippets/src/lowered/pass/allocate_buffers.cpp +++ b/src/common/snippets/src/lowered/pass/allocate_buffers.cpp @@ -28,7 +28,7 @@ bool AllocateBuffers::run(lowered::LinearIR& linear_ir, lowered::LinearIR::const size_t buffer_scratchpad_size = 0; PassPipeline pipeline; - pipeline.register_pass(linear_ir.get_config().m_loop_depth); + pipeline.register_pass(); if (m_is_optimized_mode) { pipeline.register_pass(); pipeline.register_pass(); diff --git a/src/common/snippets/src/lowered/pass/assign_registers.cpp b/src/common/snippets/src/lowered/pass/assign_registers.cpp index e071460e5d85f1..2f921214bffed4 100644 --- a/src/common/snippets/src/lowered/pass/assign_registers.cpp +++ b/src/common/snippets/src/lowered/pass/assign_registers.cpp @@ -84,25 +84,22 @@ bool AssignRegisters::run(LinearIR& linear_ir) { auto accumulator_reg = 0lu; for (const auto& expr : exprs) { auto op = expr->get_node(); - if (const auto& buffer = ov::as_type_ptr(op)) { - const auto reg_group = buffer->get_reg_group(); + if (const auto& buffer_expr = ov::as_type_ptr(expr)) { + const auto reg_group = buffer_expr->get_reg_group(); // All buffers have one common data pointer - if (ov::is_type(buffer)) { - const auto assigned_reg = num_results + num_parameters + reg_group; - for (const auto& input : expr->get_input_port_connectors()) { - manually_assigned_gprs[input] = static_cast(assigned_reg); - // shape infer ops in the middle of subgraph. IntermediateMemoryBuffer is inserted before reshape as new loop should start. - // child shape info ops share the same memory as IntermediateMemoryBuffer. 
- const auto& shape_infer_consumers = utils::get_first_child_shape_infer_expr_seq(expr); - for (const auto& child_shape_infer_expr : shape_infer_consumers) { - manually_assigned_gprs[child_shape_infer_expr->get_input_port_connector(0)] = - manually_assigned_gprs[child_shape_infer_expr->get_output_port_connector(0)] = - static_cast(assigned_reg); - } + const auto assigned_reg = num_results + num_parameters + reg_group; + for (const auto& input : expr->get_input_port_connectors()) { + manually_assigned_gprs[input] = static_cast(assigned_reg); + // shape infer ops in the middle of subgraph. Buffer is inserted before reshape as new loop should start. + // child shape info ops share the same memory as Buffer. + const auto& shape_infer_consumers = utils::get_first_child_shape_infer_expr_seq(expr); + for (const auto& child_shape_infer_expr : shape_infer_consumers) { + manually_assigned_gprs[child_shape_infer_expr->get_input_port_connector(0)] = + manually_assigned_gprs[child_shape_infer_expr->get_output_port_connector(0)] = + static_cast(assigned_reg); } } - manually_assigned_gprs[expr->get_output_port_connector(0)] = - static_cast(num_results + num_parameters + reg_group); + manually_assigned_gprs[expr->get_output_port_connector(0)] = static_cast(assigned_reg); } else if (ov::is_type(op) || ov::is_type(op)) { // Only in ReduceDecomposition Reduce ops use HorizonMax/HorizonSum and VectorBuffer. 
// We should manually set the one vector register for VectorBuffer and Max/Sum output to simulate a accumulator diff --git a/src/common/snippets/src/lowered/pass/clean_repeated_ptr_shifts.cpp b/src/common/snippets/src/lowered/pass/clean_repeated_ptr_shifts.cpp index 4cf201047d63f5..e0397b03224bc3 100644 --- a/src/common/snippets/src/lowered/pass/clean_repeated_ptr_shifts.cpp +++ b/src/common/snippets/src/lowered/pass/clean_repeated_ptr_shifts.cpp @@ -32,10 +32,10 @@ bool CleanRepeatedDataPointerShifts::reuse_increments(const LoopManagerPtr& loop std::set read_data_exprs; for (size_t i = 0; i < input_count; ++i) { const auto& parent_output = loop_connectors[i]->get_source().get_expr(); - if (const auto buffer = ov::as_type_ptr(parent_output->get_node())) { + if (const auto buffer_expr = ov::as_type_ptr(parent_output)) { // If Buffer is missed in set, Just save - it's first meeting - if (buffers_groups.count(buffer->get_reg_group()) == 0) { - buffers_groups.insert(buffer->get_reg_group()); + if (buffers_groups.count(buffer_expr->get_reg_group()) == 0) { + buffers_groups.insert(buffer_expr->get_reg_group()); } else { // The Buffer with the same ID is in set - need to add this Buffer idx to set of Buffers for resetting resetting_data_indexes.insert(i); @@ -56,17 +56,17 @@ bool CleanRepeatedDataPointerShifts::reuse_increments(const LoopManagerPtr& loop size_t buffer_count = 0; size_t loop_count = 0; for (const auto& consumer_input : consumer_inputs) { - const auto& child_node = consumer_input.get_expr()->get_node(); - if (const auto buffer = ov::as_type_ptr(child_node)) { + const auto& consumer = consumer_input.get_expr(); + if (const auto buffer_expr = ov::as_type_ptr(consumer)) { buffer_count++; // If Buffer is missed in set, Just save - it's first meeting - if (buffers_groups.count(buffer->get_reg_group()) == 0) { - buffers_groups.insert(buffer->get_reg_group()); + if (buffers_groups.count(buffer_expr->get_reg_group()) == 0) { + 
buffers_groups.insert(buffer_expr->get_reg_group()); } else { // The Buffer with the same ID is in set - need to add this Buffer idx to set of Buffers for resetting resetting_data_indexes.insert(input_count + i); } - } else if (ov::is_type(child_node)) { + } else if (ov::is_type(consumer->get_node())) { loop_count++; } } diff --git a/src/common/snippets/src/lowered/pass/compute_buffer_allocation_size.cpp b/src/common/snippets/src/lowered/pass/compute_buffer_allocation_size.cpp index 85bbed324a9865..c6f0b9bcb936cb 100644 --- a/src/common/snippets/src/lowered/pass/compute_buffer_allocation_size.cpp +++ b/src/common/snippets/src/lowered/pass/compute_buffer_allocation_size.cpp @@ -14,112 +14,16 @@ namespace snippets { namespace lowered { namespace pass { -namespace { -std::vector get_parent_inner_loops(const std::vector& parent_loops, const std::vector& current_loops) { - const auto common_rank = std::min(parent_loops.size(), current_loops.size()); - size_t i = 0; - while (i < common_rank && parent_loops[i] == current_loops[i]) - ++i; - return std::vector(parent_loops.cbegin() + i, parent_loops.cend()); -} -} // namespace - -// Ticket: 113744 -// TODO: This logic covers only several specific cases so it should be generalized. -size_t ComputeBufferAllocationSize::get_allocation_size(const LoopManagerPtr& loop_manager, const ExpressionPtr& buffer_expr, size_t allocation_rank) { - const auto& current_buffer = ov::as_type_ptr(buffer_expr->get_node()); - OPENVINO_ASSERT(current_buffer, "`get_allocation_size` expected Buffer"); - - // Note: Buffer expressions can have more than one parent after the loops splitting transformation, but only the last parent - // can be used to access valid loop ports. 
More info in the ticket: 146646 - const auto buffer_in_idx = buffer_expr->get_input_count() - 1; - const auto& parent_port = buffer_expr->get_input_port_connector(buffer_in_idx)->get_source(); - const auto& parent_loop_ids = get_parent_inner_loops(parent_port.get_expr()->get_loop_ids(), buffer_expr->get_loop_ids()); - const auto planar_shape = utils::get_preordered_vdims(parent_port); - - const size_t rank = allocation_rank >= 0 ? std::min(static_cast(allocation_rank), planar_shape.size()) - : planar_shape.size(); - - const auto& subtensor = ov::snippets::utils::get_projected_subtensor(parent_port); - - auto hard_equal = [&parent_port](const LoopPort& port) { - return *port.expr_port == parent_port; - }; - auto soft_equal = [&](const LoopPort& loop_port) { - const auto& port = *loop_port.expr_port; - // Check semantic of LoopPort - if (parent_port.get_index() != port.get_index() || - port.get_expr()->get_node()->get_type_info() != parent_port.get_expr()->get_node()->get_type_info()) - return false; - // Check that this LoopPort is connected to the same by semantic Buffer - const auto consumers = port.get_connected_ports(); - for (const auto& consumer : consumers) { - if (const auto buffer_consumer = ov::as_type_ptr(consumer.get_expr()->get_node())) { - if (buffer_consumer->get_cluster_id() == current_buffer->get_cluster_id() && consumer.get_index() == buffer_in_idx) - return true; - } - } - return false; - }; - - size_t allocation_size = 1; - std::set processed_dim_idxs; - for (const auto& parent_loop : parent_loop_ids) { - const auto loop_info = loop_manager->get_loop_info(parent_loop); - const auto& output_ports = loop_info->get_output_ports(); - auto it = std::find_if(output_ports.begin(), output_ports.end(), hard_equal); - // [149219] : Try to find original loop port if this LoopInfo is cloned after InsertSpecificIterations - // and ports are not mapped on the original ExpressionPorts - if (it == output_ports.end()) { - it = std::find_if(output_ports.begin(), 
output_ports.end(), soft_equal); - OPENVINO_ASSERT(it != output_ports.end(), "compute_allocation_shape: output port of parent loop can not be found"); - } - const auto& loop_port = *it; - const auto& dim_idx = loop_port.dim_idx; - if (loop_port.is_incremented && dim_idx < rank) { - if (const auto& unified_loop_info = ov::as_type_ptr(loop_info)) - allocation_size = utils::dynamic_safe_mul(allocation_size, unified_loop_info->get_work_amount()); - else if (const auto& expanded_loop_info = ov::as_type_ptr(loop_info)) - allocation_size = utils::dynamic_safe_mul(allocation_size, expanded_loop_info->get_unified_loop_info()->get_work_amount()); - else - OPENVINO_THROW("Unknown LoopInfo type"); - processed_dim_idxs.insert(dim_idx); - } - } - const auto processing_rank = !processed_dim_idxs.empty() ? std::max(*processed_dim_idxs.rbegin(), subtensor.size()) : subtensor.size(); - for (size_t i = 0; i < std::min(processing_rank, rank); ++i) { - if (processed_dim_idxs.count(i) == 0) { - const auto multiplier = i < subtensor.size() ? 
*(subtensor.rbegin() + i) : *(planar_shape.rbegin() + i); - allocation_size = utils::dynamic_safe_mul(allocation_size, multiplier); - } - } - - // Corner case when the current information is not enough - if (processing_rank == 0 && processed_dim_idxs.empty()) { - for (size_t i = 0; i < rank; ++i) { - allocation_size = utils::dynamic_safe_mul(allocation_size, *(planar_shape.rbegin() + i)); - } - } - - return allocation_size; -} - bool ComputeBufferAllocationSize::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::ComputeBufferAllocationSize") + const auto& allocation_rank = linear_ir.get_config().m_loop_depth; const auto& loop_manager = linear_ir.get_loop_manager(); - - const auto& buffer_expressions = linear_ir.get_buffers(); - for (const auto& buffer_expr : buffer_expressions) { - const auto node = buffer_expr->get_node(); - if (const auto buffer = ov::as_type_ptr(node)) { - // If the current size is undefined, update it - // TODO [143395] : MemoryManager will return container with only dynamic buffers without any `is_defined()` - if (!buffer->is_defined()) - buffer->set_allocation_size(get_allocation_size(loop_manager, buffer_expr, m_buffer_allocation_rank)); - } else { - OPENVINO_ASSERT(ov::is_type(node), "Expected Buffer ops in Buffer expressions of LinearIR"); - } + for (const auto& buffer_expr : linear_ir.get_buffers()) { + // If the current size is undefined, update it + // TODO [143395] : MemoryManager will return container with only dynamic buffers without any `is_defined()` + if (!buffer_expr->is_defined()) + buffer_expr->init_allocation_size(loop_manager, allocation_rank); } return true; diff --git a/src/common/snippets/src/lowered/pass/define_buffer_clusters.cpp b/src/common/snippets/src/lowered/pass/define_buffer_clusters.cpp index f3e065173baf9d..c43b5d63a358c6 100644 --- 
a/src/common/snippets/src/lowered/pass/define_buffer_clusters.cpp +++ b/src/common/snippets/src/lowered/pass/define_buffer_clusters.cpp @@ -16,17 +16,16 @@ namespace pass { using ShiftPtrParams = SetBufferRegGroup::ShiftPtrParams; -DefineBufferClusters::BufferClusters::iterator DefineBufferClusters::find_cluster_by_expr(const ExpressionPtr& target) { +DefineBufferClusters::BufferClusters::iterator DefineBufferClusters::find_cluster_by_expr(const BufferExpressionPtr& target) { return std::find_if(m_clusters.begin(), m_clusters.end(), [&target](const BufferCluster& cluster) { return cluster.count(target) > 0; }); } -bool DefineBufferClusters::is_direct_buffer(const ExpressionPtr& buffer_expr, const ExpressionPtr& target_expr) const { - const auto buffer = ov::as_type_ptr(buffer_expr->get_node()); - return buffer && buffer_expr->get_loop_ids() == target_expr->get_loop_ids(); +bool DefineBufferClusters::is_direct_buffer(const BufferExpressionPtr& buffer_expr, const ExpressionPtr& target_expr) const { + return buffer_expr && buffer_expr->get_loop_ids() == target_expr->get_loop_ids(); } -void DefineBufferClusters::create_new_cluster(const ExpressionPtr& buffer_expr) { +void DefineBufferClusters::create_new_cluster(const BufferExpressionPtr& buffer_expr) { const auto cluster_it = find_cluster_by_expr(buffer_expr); // If Buffer is missed in clusters, create new cluster with the single Buffer node inside if (cluster_it == m_clusters.cend()) { @@ -36,9 +35,8 @@ void DefineBufferClusters::create_new_cluster(const ExpressionPtr& buffer_expr) size_t DefineBufferClusters::get_cluster_buffer_id(const BufferCluster& cluster) const { OPENVINO_ASSERT(!cluster.empty(), "Buffer cluster is empty!"); - const auto id = (ov::as_type_ptr(cluster.cbegin()->get()->get_node()))->get_reg_group(); - if (std::all_of(cluster.cbegin(), cluster.cend(), - [&id](const ExpressionPtr& expr) { return (ov::as_type_ptr(expr->get_node()))->get_reg_group() == id; })) { + const auto id = 
cluster.cbegin()->get()->get_reg_group(); + if (std::all_of(cluster.cbegin(), cluster.cend(), [&id](const BufferExpressionPtr& expr) { return expr->get_reg_group() == id; })) { return id; } return SIZE_MAX; @@ -53,7 +51,7 @@ DefineBufferClusters::BufferPorts DefineBufferClusters::get_input_buffers(const // Input Buffers for (size_t i = 0; i < in_count; ++i) { - const auto source_expr = connectors[i]->get_source().get_expr(); + const auto& source_expr = ov::as_type_ptr(connectors[i]->get_source().get_expr()); if (!is_direct_buffer(source_expr, loop_expr)) continue; // Save as input Buffer @@ -74,7 +72,7 @@ DefineBufferClusters::BufferPorts DefineBufferClusters::get_output_buffers(const for (size_t i = in_count; i < in_count + out_count; ++i) { for (const auto& consumer : connectors[i]->get_consumers()) { - auto consumer_expr = consumer.get_expr(); + const auto& consumer_expr = ov::as_type_ptr(consumer.get_expr()); if (!is_direct_buffer(consumer_expr, loop_expr)) continue; // Save as output Buffer @@ -102,7 +100,6 @@ void DefineBufferClusters::parse_loop(const LinearIR::constExprIt& expr_it) { for (const auto& out : output_buffers) { const auto output_buffer_expr = out.first; const auto output_buffer_port_idx = *(out.second.cbegin()); // Output port is always one - const auto output_buffer = ov::as_type_ptr(output_buffer_expr->get_node()); bool has_been_added = false; for (const auto& in : input_buffers) { @@ -110,17 +107,15 @@ void DefineBufferClusters::parse_loop(const LinearIR::constExprIt& expr_it) { if (visited_buffers.count(input_buffer_expr) > 0) continue; - const auto input_buffer = ov::as_type_ptr(input_buffer_expr->get_node()); - // If allocated sizes of buffers are unkown on compilation stage (dynamic), // we cannot be sure that they're will be the same in runtime. 
- if (!input_buffer->is_defined()|| !output_buffer->is_defined()) + if (!input_buffer_expr->is_defined() || !output_buffer_expr->is_defined()) continue; // Memory can be reused if reading and writing are executed proportionally: // - the same reading/writing order // - the same buffer memory sizes - if ((input_buffer->get_byte_size() != output_buffer->get_byte_size()) || + if ((input_buffer_expr->get_byte_size() != output_buffer_expr->get_byte_size()) || (input_buffer_expr->get_output_port_descriptor(0)->get_layout() != output_buffer_expr->get_input_port_descriptor(0)->get_layout())) continue; @@ -184,13 +179,13 @@ void DefineBufferClusters::parse_nested_loops(const BufferPorts& input_buffers, for (auto it = std::reverse_iterator(outer_loop_end_expr_it); (*it)->get_node() != outer_loop_begin; ++it) { const auto& inner_expr = *it; - if (const auto inner_buffer = ov::as_type_ptr(inner_expr->get_node())) { - const auto inner_cluster_it = find_cluster_by_expr(inner_expr); + if (const auto inner_buffer_expr = ov::as_type_ptr(inner_expr)) { + const auto inner_cluster_it = find_cluster_by_expr(inner_buffer_expr); OPENVINO_ASSERT(inner_cluster_it != m_clusters.cend(), "Buffer cluster has not been found"); const auto inner_cluster_id = get_cluster_buffer_id(*inner_cluster_it); if (inner_cluster_id == SIZE_MAX) continue; - const auto final_offset = get_buffer_finalization_offset(inner_expr); + const auto final_offset = get_buffer_finalization_offset(inner_buffer_expr); auto unite = [&](const BufferPorts& ports, const bool is_input) { bool applied = false; @@ -200,13 +195,13 @@ void DefineBufferClusters::parse_nested_loops(const BufferPorts& input_buffers, // If the buffers are already in the same cluster or have different Buffer ID - skip if (cluster_it == inner_cluster_it) continue; // Buffer from one cluster must be only defined (with known allocation_size) or dynamic (with unknown allocation_size) - if (inner_buffer->is_defined() != 
ov::as_type_ptr(port.first->get_node())->is_defined()) continue; + if (inner_buffer_expr->is_defined() != port.first->is_defined()) continue; bool can_be_reused = true; for (const auto idx : port.second) { can_be_reused = can_be_reused && can_be_data_ptr_proportionally_shifted(outer_ptr_increments[idx], outer_data_sizes[idx], - final_offset, inner_buffer->get_element_type().size()); + final_offset, inner_buffer_expr->get_node()->get_element_type().size()); } if (!can_be_reused) continue; @@ -223,7 +218,7 @@ void DefineBufferClusters::parse_nested_loops(const BufferPorts& input_buffers, } } -int64_t DefineBufferClusters::get_buffer_finalization_offset(const ExpressionPtr& buffer_expr) const { +int64_t DefineBufferClusters::get_buffer_finalization_offset(const BufferExpressionPtr& buffer_expr) const { auto index = [](const std::vector& loop_inputs, const PortConnectorPtr& buffer_out) { const auto it = std::find(loop_inputs.cbegin(), loop_inputs.cend(), buffer_out); OPENVINO_ASSERT(it != loop_inputs.cend(), "Buffer output PortConnector has not been found in target LoopEnd inputs"); @@ -252,7 +247,7 @@ int64_t DefineBufferClusters::get_buffer_finalization_offset(const ExpressionPtr bool DefineBufferClusters::unite_nested_clusters(const BufferClusters::iterator& inner_cluster_it, BufferCluster& outer_cluster, - const ExpressionPtr& outer_buffer, bool is_outer_up) { + const BufferExpressionPtr& outer_buffer, bool is_outer_up) { for (const auto& inner_buffer : *inner_cluster_it) { ExpressionPtr common_loop_end_expr = nullptr; size_t outer_idx = SIZE_MAX, inner_idx = SIZE_MAX; @@ -267,9 +262,8 @@ bool DefineBufferClusters::unite_nested_clusters(const BufferClusters::iterator& const auto& inner_data_sizes = common_loop_end->get_element_type_sizes(); if (SetBufferRegGroup::can_be_in_one_group({ inner_data_sizes[up_idx], inner_ptr_increments[up_idx], inner_final_offsets[up_idx] }, { inner_data_sizes[down_idx], inner_ptr_increments[down_idx], inner_final_offsets[down_idx] })) 
{ - const auto buffer_reg_group = ov::as_type_ptr(outer_buffer->get_node())->get_reg_group(); for (const auto& inner_buffer : *inner_cluster_it) - ov::as_type_ptr(inner_buffer->get_node())->set_reg_group(buffer_reg_group); + inner_buffer->set_reg_group(outer_buffer->get_reg_group()); outer_cluster.insert(inner_cluster_it->cbegin(), inner_cluster_it->cend()); m_clusters.erase(inner_cluster_it); @@ -280,7 +274,8 @@ bool DefineBufferClusters::unite_nested_clusters(const BufferClusters::iterator& return false; } -bool DefineBufferClusters::are_buffer_neighbours(const ExpressionPtr& up, const ExpressionPtr& down, ExpressionPtr& loop, size_t& up_idx, size_t& down_idx) { +bool DefineBufferClusters::are_buffer_neighbours(const BufferExpressionPtr& up, const BufferExpressionPtr& down, ExpressionPtr& loop, + size_t& up_idx, size_t& down_idx) { auto find_input = [&down](const PortConnectorPtr& in) { return in->get_source().get_expr() == down; }; @@ -323,15 +318,15 @@ void DefineBufferClusters::parse_memory_access_op(const ExpressionPtr& expr) { // TODO: Some full MemoryAccess ops can have inplace inputs and outputs in general. 
// Need to add mechanism of inplace ports using MemoryAccess::PortDescriptor::inplace for (const auto& input : expr->get_input_port_connectors()) { - if (is_direct_buffer(input->get_source().get_expr(), expr)) { - create_new_cluster(input->get_source().get_expr()); - } + const auto& buffer_expr = ov::as_type_ptr(input->get_source().get_expr()); + if (is_direct_buffer(buffer_expr, expr)) + create_new_cluster(buffer_expr); } for (const auto& output : expr->get_output_port_connectors()) { for (const auto& consumer : output->get_consumers()) { - if (is_direct_buffer(consumer.get_expr(), expr)) { - create_new_cluster(consumer.get_expr()); - } + const auto& buffer_expr = ov::as_type_ptr(consumer.get_expr()); + if (is_direct_buffer(buffer_expr, expr)) + create_new_cluster(buffer_expr); } } } @@ -357,10 +352,8 @@ bool DefineBufferClusters::run(lowered::LinearIR& linear_ir, lowered::LinearIR:: for (size_t cluster_id = 0; cluster_id < m_clusters.size(); ++cluster_id) { const auto& cluster = m_clusters[cluster_id]; - std::for_each(cluster.cbegin(), cluster.cend(), [&cluster_id](const ExpressionPtr& buffer_expr) { - const auto& buffer = ov::as_type_ptr(buffer_expr->get_node()); - OPENVINO_ASSERT(buffer, "Buffer clusters expects Buffer nodes"); - buffer->set_cluster_id(cluster_id); + std::for_each(cluster.cbegin(), cluster.cend(), [&cluster_id](const BufferExpressionPtr& buffer_expr) { + buffer_expr->set_cluster_id(cluster_id); }); } diff --git a/src/common/snippets/src/lowered/pass/init_buffers_default.cpp b/src/common/snippets/src/lowered/pass/init_buffers_default.cpp index e48f833380e5e3..90a7ddf0b3d21c 100644 --- a/src/common/snippets/src/lowered/pass/init_buffers_default.cpp +++ b/src/common/snippets/src/lowered/pass/init_buffers_default.cpp @@ -18,21 +18,17 @@ bool InitBuffersDefault::run(lowered::LinearIR& linear_ir, lowered::LinearIR::co size_t idx = 0; size_t offset = 0; - for (auto expr_it = begin; expr_it != end; ++expr_it) { - const auto& expr = *expr_it; - const 
auto op = expr->get_node(); - if (const auto buffer = ov::as_type_ptr(op)) { - buffer->set_reg_group(idx); - buffer->set_cluster_id(idx); - - if (!buffer->is_defined()) { - buffer->set_offset(utils::get_dynamic_value()); - } else { - buffer->set_offset(offset); - offset += buffer->get_byte_size(); - } - idx++; + for (const auto& buffer_expr : linear_ir.get_buffers()) { + buffer_expr->set_reg_group(idx); + buffer_expr->set_cluster_id(idx); + + if (!buffer_expr->is_defined()) { + buffer_expr->set_offset(utils::get_dynamic_value()); + } else { + buffer_expr->set_offset(offset); + offset += buffer_expr->get_byte_size(); } + idx++; } m_buffer_scratchpad_size = offset; diff --git a/src/common/snippets/src/lowered/pass/init_loops.cpp b/src/common/snippets/src/lowered/pass/init_loops.cpp index cd7e2b55b7478f..aa7d0ab042e1a4 100644 --- a/src/common/snippets/src/lowered/pass/init_loops.cpp +++ b/src/common/snippets/src/lowered/pass/init_loops.cpp @@ -25,11 +25,11 @@ inline void init_is_incremented(LoopPort& port, size_t loop_id) { // Note: LoopPort connected to Buffer between two loops should not be incremented in the outermost loop // Consider the example below: // Store; Loop ids [0,1,2,3] - // IntermediateMemoryBuffer; Loop ids [0,1] + // Buffer; Loop ids [0,1] // Load; Loop ids [0,1,4,5] // Store is output port of Loop-1, but it should be incremented only in Loop-2 and Loop-3. Similar with Load. 
auto is_ignored = [=](const ExpressionPtr& target_expr) { - if (ov::is_type(target_expr->get_node())) { + if (ov::is_type(target_expr)) { const auto& target_loops = target_expr->get_loop_ids(); const auto i_max = std::min(expr_loops.size(), target_loops.size()); for (size_t i = 0; i < i_max && expr_loops[i] == target_loops[i]; i++) { diff --git a/src/common/snippets/src/lowered/pass/insert_buffers.cpp b/src/common/snippets/src/lowered/pass/insert_buffers.cpp index c6b5c3960e025b..fabb6573ab3b14 100644 --- a/src/common/snippets/src/lowered/pass/insert_buffers.cpp +++ b/src/common/snippets/src/lowered/pass/insert_buffers.cpp @@ -115,7 +115,7 @@ void InsertBuffers::insertion(LinearIR& linear_ir, // Current expr Loop identifies: 3, 4, 6 // Need to insert between 2nd and 4th Loops - after 2nd Loop const auto pos = insertion_position(linear_ir, loop_manager, parent_expr, expr); - const auto buffer = std::make_shared(parent->output(parent_port)); + const auto buffer = std::make_shared(parent->output(parent_port)); const auto buffer_consumer = has_shape_infer_parent ? 
top_shape_infer_expr->get_input_port(0) : *entry_port; linear_ir.insert_node(buffer, std::vector{ parent_expr_output }, buffer_loop_ids, false, pos, { buffer_consumer }); } @@ -191,7 +191,7 @@ void InsertBuffers::insertion(LinearIR& linear_ir, // Note: All potential consumers must have the same count of first equal Loop identifies and the same count of different last identifies const auto pos = insertion_position(linear_ir, loop_manager, expr, consumer_expr); - auto buffer = std::make_shared(node->output(port_idx)); + auto buffer = std::make_shared(node->output(port_idx)); // We cannot insert Node output connector on Buffer output because not all consumers of Node needs Buffer // Example: // Add diff --git a/src/common/snippets/src/lowered/pass/insert_load_store.cpp b/src/common/snippets/src/lowered/pass/insert_load_store.cpp index 231c783849908d..1885738eeb04b3 100644 --- a/src/common/snippets/src/lowered/pass/insert_load_store.cpp +++ b/src/common/snippets/src/lowered/pass/insert_load_store.cpp @@ -76,9 +76,9 @@ bool InsertLoadStore::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt be modified |= insert_load(linear_ir, expr_it); } else if (ov::is_type(node)) { modified |= insert_store(linear_ir, expr_it); - } else if (ov::is_type(node)) { + } else if (ov::is_type(expr)) { modified |= insert_load(linear_ir, expr_it); - if (ov::is_type(node)) + if (expr->get_input_count() > 0) modified |= insert_store(linear_ir, expr_it); } } diff --git a/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp b/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp index badf4b0477759c..1e99f8c845161f 100644 --- a/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp +++ b/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp @@ -32,15 +32,19 @@ void connect_cloned_body_with_buffers_outside(LinearIR::constExprIt cur_begin, L const auto& consumers = original_expr->get_output_port_connector(i)->get_consumers(); for (const 
auto& consumer : consumers) { const auto consumer_expr = consumer.get_expr(); - const auto buffer = ov::as_type_ptr(consumer_expr->get_node()); - if (buffer && std::find(cur_begin, cur_end, consumer.get_expr()) == cur_end) { - OutputVector new_inputs = {result_expr->get_node()->output(i)}; - for (const auto& input : consumer_expr->get_input_port_connectors()) { - const auto& source = input->get_source(); - new_inputs.push_back(source.get_expr()->get_node()->output(source.get_index())); + const auto buffer_expr = ov::as_type_ptr(consumer_expr); + if (buffer_expr && std::find(cur_begin, cur_end, consumer.get_expr()) == cur_end) { + std::vector new_descs = {buffer_expr->get_input_port_descriptor(consumer.get_index())->clone()}; + std::vector new_inputs = {result_expr->get_output_port_connector(i)}; + OutputVector new_op_inputs = {result_expr->get_node()->output(i)}; + for (size_t j = 0; j < buffer_expr->get_input_count(); ++j) { + const auto& source = buffer_expr->get_input_port_connector(j)->get_source(); + new_op_inputs.push_back(source.get_expr()->get_node()->output(source.get_index())); + new_descs.push_back(buffer_expr->get_input_port_descriptor(j)->clone()); + new_inputs.push_back(buffer_expr->get_input_port_connector(j)); } - const auto new_buffer = buffer->clone_with_new_inputs(new_inputs); - linear_ir.replace_with_node({consumer_expr}, new_buffer); + const auto new_buffer_op = buffer_expr->get_node()->clone_with_new_inputs(new_op_inputs); + linear_ir.replace_with_expr({consumer_expr}, buffer_expr->clone_with_new_inputs(new_buffer_op, new_inputs, new_descs)); break; } } diff --git a/src/common/snippets/src/lowered/pass/normalize_buffer_reg_groups.cpp b/src/common/snippets/src/lowered/pass/normalize_buffer_reg_groups.cpp index 3e235749ce7ca2..3431a198f90dc6 100644 --- a/src/common/snippets/src/lowered/pass/normalize_buffer_reg_groups.cpp +++ b/src/common/snippets/src/lowered/pass/normalize_buffer_reg_groups.cpp @@ -18,17 +18,13 @@ bool 
NormalizeBufferRegisterGroups::run(lowered::LinearIR& linear_ir, lowered::L // [ original Buffer reg group -> normalized ] std::map buffer_reg_groups; - for (auto expr_it = begin; expr_it != end; ++expr_it) { - const auto& expr = *expr_it; - const auto op = expr->get_node(); - if (const auto buffer = ov::as_type_ptr(op)) { - const auto group = buffer->get_reg_group(); - if (buffer_reg_groups.count(group) == 0) { - const auto new_id = buffer_reg_groups.size(); - buffer_reg_groups[group] = new_id; - } - buffer->set_reg_group(buffer_reg_groups[group]); + for (const auto& buffer_expr : linear_ir.get_buffers()) { + const auto group = buffer_expr->get_reg_group(); + if (buffer_reg_groups.count(group) == 0) { + const auto new_id = buffer_reg_groups.size(); + buffer_reg_groups[group] = new_id; } + buffer_expr->set_reg_group(buffer_reg_groups[group]); } return buffer_reg_groups.size(); } diff --git a/src/common/snippets/src/lowered/pass/propagate_buffer_offset.cpp b/src/common/snippets/src/lowered/pass/propagate_buffer_offset.cpp index abab05700c2344..4e7d17cf284f89 100644 --- a/src/common/snippets/src/lowered/pass/propagate_buffer_offset.cpp +++ b/src/common/snippets/src/lowered/pass/propagate_buffer_offset.cpp @@ -17,28 +17,24 @@ namespace lowered { namespace pass { -void PropagateBufferOffset::propagate(const ExpressionPtr& buffer_expr) { +void PropagateBufferOffset::propagate(const BufferExpressionPtr& buffer_expr) { // If Buffer has offset We set this offset in the connected MemoryAccess ops // to correctly read and write data because all Buffers have the common data pointer on buffer scratchpad - const auto buffer = ov::as_type_ptr(buffer_expr->get_node()); - OPENVINO_ASSERT(buffer, "Failed to propagate Buffer offset: PropagateBufferOffset expects Buffer op"); - const auto offset = buffer->get_offset(); + const auto offset = buffer_expr->get_offset(); // Propagate to up: in Store. 
Buffer can have only one Store - if (ov::is_type(buffer)) { - for (const auto& input : buffer_expr->get_input_port_connectors()) { - const auto& parent_output = input->get_source(); - const auto& parent_expr = parent_output.get_expr(); - const auto port = parent_output.get_index(); - const auto& parent_node = parent_expr->get_node(); - auto memory_access = std::dynamic_pointer_cast(parent_node); - if (memory_access && memory_access->is_memory_access_output_port(port)) { - memory_access->set_output_offset(offset, port); - } else { - OPENVINO_THROW( - "PropagateBufferOffset didn't find the connected MemoryAccess op to Buffer for offset propagation"); - } + for (const auto& input : buffer_expr->get_input_port_connectors()) { + const auto& parent_output = input->get_source(); + const auto& parent_expr = parent_output.get_expr(); + const auto port = parent_output.get_index(); + const auto& parent_node = parent_expr->get_node(); + auto memory_access = std::dynamic_pointer_cast(parent_node); + if (memory_access && memory_access->is_memory_access_output_port(port)) { + memory_access->set_output_offset(offset, port); + } else { + OPENVINO_THROW( + "PropagateBufferOffset didn't find the connected MemoryAccess op to Buffer for offset propagation"); } } // Propagate to down: in Load. 
Buffer can have several Load @@ -65,10 +61,8 @@ void PropagateBufferOffset::propagate(const ExpressionPtr& buffer_expr) { bool PropagateBufferOffset::run(lowered::LinearIR& linear_ir) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::PropagateBufferOffset"); - const auto& buffer_expressions = linear_ir.get_buffers(); - for (const auto& buffer_expr : buffer_expressions) { + for (const auto& buffer_expr : linear_ir.get_buffers()) propagate(buffer_expr); - } return true; } diff --git a/src/common/snippets/src/lowered/pass/set_buffer_reg_group.cpp b/src/common/snippets/src/lowered/pass/set_buffer_reg_group.cpp index 59c9bf21a0894a..9bdb5e8ef3a9dc 100644 --- a/src/common/snippets/src/lowered/pass/set_buffer_reg_group.cpp +++ b/src/common/snippets/src/lowered/pass/set_buffer_reg_group.cpp @@ -28,7 +28,7 @@ bool operator!=(const SetBufferRegGroup::ShiftPtrParams& lhs, const SetBufferReg return !(rhs == lhs); } -size_t SetBufferRegGroup::get_buffer_idx(const ExpressionPtr& target, const BufferPool& pool) { +size_t SetBufferRegGroup::get_buffer_idx(const BufferExpressionPtr& target, const BufferPool& pool) { const auto iter = std::find(pool.cbegin(), pool.cend(), target); OPENVINO_ASSERT(iter != pool.cend(), "Buffer wasn't find in Buffer system of Subgraph"); return std::distance(pool.cbegin(), iter); @@ -44,8 +44,8 @@ bool SetBufferRegGroup::can_be_in_one_group(const ShiftPtrParams& lhs, const Shi return are_static && equal_ptr_params_shifting && (equal_element_type_sizes || (lhs.ptr_increment == 0 && lhs.finalization_offset == 0)); } -bool SetBufferRegGroup::are_adjacent(const std::pair& lhs, - const std::pair& rhs) { +bool SetBufferRegGroup::are_adjacent(const std::pair& lhs, + const std::pair& rhs) { const auto& lhs_ids = lhs.first->get_loop_ids(); const auto& rhs_ids = rhs.first->get_loop_ids(); const auto equal_loop_ids = lhs_ids == rhs_ids; @@ -64,10 +64,10 @@ bool SetBufferRegGroup::are_adjacent(const std::pair& lhs, - const std::pair& rhs, 
- const BufferPool& buffers, - std::vector& adj) { +void SetBufferRegGroup::update_adj_matrix(const std::pair& lhs, + const std::pair& rhs, + const BufferPool& buffers, + std::vector& adj) { const auto size = buffers.size(); const auto lhs_idx = get_buffer_idx(lhs.first, buffers); const auto rhs_idx = get_buffer_idx(rhs.first, buffers); @@ -125,14 +125,14 @@ SetBufferRegGroup::BufferMap SetBufferRegGroup::get_buffer_loop_neighbours(const BufferMap buffer_neighbours; for (size_t i = 0; i < input_count; ++i) { const auto& parent_output = loop_end_expr->get_input_port_connector(i)->get_source().get_expr(); - if (ov::is_type(parent_output->get_node())) { - if (buffer_neighbours.count(parent_output) > 0) { - OPENVINO_ASSERT(buffer_neighbours[parent_output].ptr_increment == ptr_increments[i] && - buffer_neighbours[parent_output].finalization_offset == finalization_offsets[i], + if (const auto buffer_expr = ov::as_type_ptr(parent_output)) { + if (buffer_neighbours.count(buffer_expr) > 0) { + OPENVINO_ASSERT(buffer_neighbours[buffer_expr].ptr_increment == ptr_increments[i] && + buffer_neighbours[buffer_expr].finalization_offset == finalization_offsets[i], "Invalid data pointer shifts: If Buffer has several consumers, this consumers must have the same shifts or zero"); continue; } - buffer_neighbours[parent_output] = { data_sizes[i], ptr_increments[i], finalization_offsets[i] }; + buffer_neighbours[buffer_expr] = { data_sizes[i], ptr_increments[i], finalization_offsets[i] }; } } for (size_t i = input_count; i < input_count + output_count; ++i) { @@ -142,8 +142,8 @@ SetBufferRegGroup::BufferMap SetBufferRegGroup::get_buffer_loop_neighbours(const size_t loop_count = 0; for (const auto& consumer_input : consumer_inputs) { const auto& child_expr = consumer_input.get_expr(); - if (ov::is_type(child_expr->get_node())) { - buffer_neighbours[child_expr] = { data_sizes[i], ptr_increments[i], finalization_offsets[i] }; + if (const auto buffer_expr = ov::as_type_ptr(child_expr)) { + 
buffer_neighbours[buffer_expr] = { data_sizes[i], ptr_increments[i], finalization_offsets[i] }; buffer_count++; } else if (ov::is_type(child_expr->get_node())) { loop_count++; @@ -163,10 +163,10 @@ SetBufferRegGroup::BufferMap SetBufferRegGroup::get_buffer_loop_inside(const Lin BufferMap inner_buffers; for (auto it = std::reverse_iterator(loop_end_it); (*it)->get_node() != loop_begin; ++it) { const auto& inner_expr = *it; - if (ov::is_type(inner_expr->get_node())) { + if (const auto buffer_expr = ov::as_type_ptr(inner_expr)) { // Set default zero values since it's not used for adjacency definition in case with Buffers in Loop - if (inner_buffers.count(inner_expr) == 0) - inner_buffers[inner_expr] = { 0, 0, 0 }; + if (inner_buffers.count(buffer_expr) == 0) + inner_buffers[buffer_expr] = { 0, 0, 0 }; } } return inner_buffers; @@ -176,7 +176,7 @@ auto SetBufferRegGroup::coloring(BufferPool& buffers, std::vector& adj) -> size_t color = 0; std::map color_groups; const auto size = buffers.size(); - for (size_t i = 0; i < size; i++) { + for (size_t i = 0; i < size; ++i) { // The Buffer is already colored (visited) - skip if (!buffers[i]) continue; @@ -186,7 +186,7 @@ auto SetBufferRegGroup::coloring(BufferPool& buffers, std::vector& adj) -> buffers[i] = nullptr; // Remove from graph vertices // While Buffer `i` has non-coloured non-neighbours (while row `i` contains 0) - while (!std::accumulate(adj.begin() + i * size, adj.begin() + (i + 1) * size, true, std::logical_and())) { + while ((i + 1 < size) && !std::accumulate(adj.begin() + i * size, adj.begin() + (i + 1) * size, true, std::logical_and())) { size_t j = i + 1; // Find first non-adjacent and non-visited (non-colored) Buffer to color him to the same color for (; j < size; ++j) { @@ -220,14 +220,10 @@ auto SetBufferRegGroup::coloring(BufferPool& buffers, std::vector& adj) -> bool SetBufferRegGroup::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { 
OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::SetBufferRegGroup") // Identify Buffers using Graph coloring algorithm. - BufferPool buffer_pool; - - for (auto expr_it = begin; expr_it != end; ++expr_it) { - const auto& expr = *expr_it; - if (ov::is_type(expr->get_node())) { - buffer_pool.push_back(expr); - } - } + BufferPool buffer_pool = linear_ir.get_buffers(); + // For the better coloring Buffers should be stored in the order of execution numbers + std::sort(buffer_pool.begin(), buffer_pool.end(), + [](const BufferExpressionPtr& lhs, const BufferExpressionPtr& rhs) { return lhs->get_exec_num() < rhs->get_exec_num(); }); // Creation of Adj matrix auto adj = create_adjacency_matrix(begin, end, buffer_pool); @@ -238,9 +234,8 @@ bool SetBufferRegGroup::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt for (const auto& pair : color_groups) { const auto color = pair.first; const auto& united_buffers = pair.second; - for (const auto& buffer_expr : united_buffers) { - ov::as_type_ptr(buffer_expr->get_node())->set_reg_group(color); - } + for (const auto& buffer_expr : united_buffers) + buffer_expr->set_reg_group(color); } return true; diff --git a/src/common/snippets/src/lowered/pass/solve_buffer_memory.cpp b/src/common/snippets/src/lowered/pass/solve_buffer_memory.cpp index 2a6b68738f7a68..ca85cefd369099 100644 --- a/src/common/snippets/src/lowered/pass/solve_buffer_memory.cpp +++ b/src/common/snippets/src/lowered/pass/solve_buffer_memory.cpp @@ -28,22 +28,17 @@ std::map create_execution_number_mapping(const LinearIR& linear_ir) } } // namespace -std::pair SolveBufferMemory::extract_static_and_dynamic_buffers(const LinearIR::container& buffer_expressions) { - LinearIR::container static_buffer_exprs, dynamic_buffer_exprs; +std::pair SolveBufferMemory::extract_static_and_dynamic_buffers(const Buffers& buffer_expressions) { + Buffers static_buffer_exprs, dynamic_buffer_exprs; for (const auto& buffer_expr : buffer_expressions) { - const auto& 
buffer = ov::as_type_ptr(buffer_expr->get_node()); - OPENVINO_ASSERT(buffer, "Buffer clusters expects Buffer nodes"); - - auto& clusters = buffer->is_defined() ? static_buffer_exprs : dynamic_buffer_exprs; + auto& clusters = buffer_expr->is_defined() ? static_buffer_exprs : dynamic_buffer_exprs; clusters.push_back(buffer_expr); } // Validation check that buffer cluster has only static or dynamic buffers. for (const auto& static_buffer : static_buffer_exprs) { - const auto static_cluster_id = ov::as_type_ptr(static_buffer->get_node())->get_cluster_id(); - auto is_cluster_ids_the_same = [&static_cluster_id](const ExpressionPtr& expr) { - return static_cluster_id == ov::as_type_ptr(expr->get_node())->get_cluster_id(); - }; + const auto static_cluster_id = static_buffer->get_cluster_id(); + auto is_cluster_ids_the_same = [&static_cluster_id](const BufferExpressionPtr& expr) { return static_cluster_id == expr->get_cluster_id(); }; OPENVINO_ASSERT(std::none_of(dynamic_buffer_exprs.cbegin(), dynamic_buffer_exprs.cend(), is_cluster_ids_the_same), "There is Buffer cluster with buffers which has defined and undefined allocation sizes"); } @@ -51,7 +46,7 @@ std::pair SolveBufferMemory::extract_s return { static_buffer_exprs, dynamic_buffer_exprs }; } -std::vector SolveBufferMemory::init_boxes(const LinearIR::container& buffer_expressions, const LinearIR& linear_ir) { +std::vector SolveBufferMemory::init_boxes(const Buffers& buffer_expressions, const LinearIR& linear_ir) { // ov::MemorySolver interface requires integer execution numbers (lifetime must be integer). 
// To align with ov::MemorySolver interface, we create the map [double -> integer] const auto int_execution_numbers = create_execution_number_mapping(linear_ir); @@ -63,9 +58,7 @@ std::vector SolveBufferMemory::init_boxes(const LinearIR: std::map map_boxes; for (const auto& buffer_expr : buffer_expressions) { - const auto& buffer = ov::as_type_ptr(buffer_expr->get_node()); - OPENVINO_ASSERT(buffer, "Buffer clusters expects Buffer nodes"); - auto cluster_id = static_cast(buffer->get_cluster_id()); + auto cluster_id = static_cast(buffer_expr->get_cluster_id()); if (map_boxes.count(cluster_id) == 0) { map_boxes[cluster_id] = { std::numeric_limits::max(), 0, 0, cluster_id }; @@ -98,7 +91,7 @@ std::vector SolveBufferMemory::init_boxes(const LinearIR: } OPENVINO_ASSERT(e_start <= e_finish, "Incorrect life time of buffer!"); - auto buffer_size = static_cast(buffer->get_byte_size()); + auto buffer_size = static_cast(buffer_expr->get_byte_size()); box.size = std::max(buffer_size, box.size); box.start = std::min(e_start, box.start); @@ -119,7 +112,7 @@ std::vector SolveBufferMemory::init_boxes(const LinearIR: return boxes; } -void SolveBufferMemory::solve_static_buffer_memory(const LinearIR::container& static_buffer_expressions, const LinearIR& linear_ir) { +void SolveBufferMemory::solve_static_buffer_memory(const Buffers& static_buffer_expressions, const LinearIR& linear_ir) { const auto boxes = init_boxes(static_buffer_expressions, linear_ir); ov::MemorySolver memSolver(boxes); @@ -127,37 +120,28 @@ void SolveBufferMemory::solve_static_buffer_memory(const LinearIR::container& st // Set offsets for Buffers for (const auto& buffer_expr : static_buffer_expressions) { - const auto& buffer = ov::as_type_ptr(buffer_expr->get_node()); - OPENVINO_ASSERT(buffer, "Buffer clusters expects Buffer nodes"); - - const auto offset = static_cast(memSolver.get_offset(static_cast(buffer->get_cluster_id()))); - buffer->set_offset(offset * m_alignment); // alignment in byte + const auto offset 
= static_cast(memSolver.get_offset(static_cast(buffer_expr->get_cluster_id()))); + buffer_expr->set_offset(offset * m_alignment); // alignment in byte } } -void SolveBufferMemory::set_dynamic_buffer_offset(const LinearIR::container& dynamic_buffer_expressions) { +void SolveBufferMemory::set_dynamic_buffer_offset(const Buffers& dynamic_buffer_expressions) { size_t offset = utils::get_dynamic_value(); // If there are not allocated memory for static buffers in LinearIR and there is only one cluster of dynamic buffer exprs, // we can force offset = 0 if (m_static_buffer_scratchpad_size == 0) { std::set dynamic_clusters; - for (const auto& dynamic_buffer_expr : dynamic_buffer_expressions) { - const auto& buffer = ov::as_type_ptr(dynamic_buffer_expr->get_node()); - OPENVINO_ASSERT(buffer, "Buffer clusters expects Buffer nodes"); - dynamic_clusters.insert(buffer->get_cluster_id()); - } + for (const auto& dynamic_buffer_expr : dynamic_buffer_expressions) + dynamic_clusters.insert(dynamic_buffer_expr->get_cluster_id()); + if (dynamic_clusters.size() == 1) offset = 0; } // Set offsets for Buffers - for (const auto& buffer_expr : dynamic_buffer_expressions) { - const auto& buffer = ov::as_type_ptr(buffer_expr->get_node()); - OPENVINO_ASSERT(buffer, "Buffer clusters expects Buffer nodes"); - - buffer->set_offset(offset); - } + for (const auto& buffer_expr : dynamic_buffer_expressions) + buffer_expr->set_offset(offset); } bool SolveBufferMemory::run(LinearIR& linear_ir) { @@ -165,7 +149,7 @@ bool SolveBufferMemory::run(LinearIR& linear_ir) { // TODO [143395] : MemoryManager will be able to return two containers with dynamic and static buffers // without additional `extract` functions in all passes - LinearIR::container static_buffer_exprs, dynamic_buffer_exprs; + Buffers static_buffer_exprs, dynamic_buffer_exprs; std::tie(static_buffer_exprs, dynamic_buffer_exprs) = extract_static_and_dynamic_buffers(linear_ir.get_buffers()); if (!static_buffer_exprs.empty()) diff --git 
a/src/common/snippets/src/lowered/pass/validate.cpp b/src/common/snippets/src/lowered/pass/validate.cpp index 24fff8ab0fc00b..2e9e5813c03264 100644 --- a/src/common/snippets/src/lowered/pass/validate.cpp +++ b/src/common/snippets/src/lowered/pass/validate.cpp @@ -64,10 +64,12 @@ void validate_result(const ExpressionPtr& expr, const LinearIR& linear_ir) { void validate_buffer(const ExpressionPtr& expr, const LinearIR& linear_ir) { OPENVINO_ASSERT(ov::is_type(expr->get_node()), "Buffer validation expects Buffer op"); + OPENVINO_ASSERT(ov::is_type(expr), + "Buffer validation expects Buffer expression"); for (const auto& input : expr->get_input_port_connectors()) { const auto& source = input->get_source(); const auto ma = std::dynamic_pointer_cast(source.get_expr()->get_node()); - OPENVINO_ASSERT(ma && ma->is_memory_access_input_port(source.get_index()), + OPENVINO_ASSERT(ma && ma->is_memory_access_output_port(source.get_index()), "Buffer expects MemoryAccess parent"); const auto buffer_siblings = input->get_consumers(); for (const auto& buffer_sibling : buffer_siblings) { @@ -124,39 +126,6 @@ void validate_loop_end(const ExpressionPtr& expr, const LinearIR& linear_ir) { validate_loop_ports(input_port_infos); validate_loop_ports(output_port_infos, loop_end->get_input_num()); } - -// TODO [143395] : Extract this validation checks to the separate `ValidateBuffers` pass -void validate_buffer_expressions(const LinearIR::container& buffer_expressions) { - std::set cluster_ids; - std::map> dynamic_buffer_clusters, static_buffer_clusters; - - for (const auto& buffer_expr : buffer_expressions) { - const auto buffer = ov::as_type_ptr(buffer_expr->get_node()); - OPENVINO_ASSERT(buffer, "Expected Buffer ops in Buffer expressions of LinearIR"); - - // TODO [143395] : MemoryManager should provide exact containers with needed buffers (static or dynamic) without any `is_defined()` - auto& clusters = buffer->is_defined() ? 
static_buffer_clusters : dynamic_buffer_clusters; - clusters[buffer->get_cluster_id()].insert(buffer_expr); - cluster_ids.insert(buffer->get_cluster_id()); - } - - OPENVINO_ASSERT(cluster_ids.size() == dynamic_buffer_clusters.size() + static_buffer_clusters.size(), "Incorrect count of Buffer clusters"); - OPENVINO_ASSERT(cluster_ids.empty() || (*cluster_ids.cbegin() == 0 && *cluster_ids.crbegin() == (cluster_ids.size() - 1)), - "Incorrect indetifiers of Buffer clusters"); - - for (const auto& p : static_buffer_clusters) { - const auto& cluster_id = p.first; - const auto& cluster = p.second; - OPENVINO_ASSERT(dynamic_buffer_clusters.count(cluster_id) == 0, "Buffers from the same cluster must be only static or dynamic"); - - OPENVINO_ASSERT(cluster.size() > 0, "Incorrect size of buffer cluster"); - size_t cluster_offset = ov::as_type_ptr((*cluster.cbegin())->get_node())->get_offset(); - for (const auto& buffer_expr : cluster) { - OPENVINO_ASSERT(cluster_offset == ov::as_type_ptr(buffer_expr->get_node())->get_offset(), - "Static Buffers from the same cluster must have the same offset!"); - } - } -} } // namespace Validate::Validate() { @@ -188,8 +157,6 @@ bool Validate::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lo prev_exec_order = expr->get_exec_num(); } - validate_buffer_expressions(linear_ir.get_buffers()); - return false; } diff --git a/src/common/snippets/src/lowered/pass/validate_buffers.cpp b/src/common/snippets/src/lowered/pass/validate_buffers.cpp new file mode 100644 index 00000000000000..e955f10ca09ad4 --- /dev/null +++ b/src/common/snippets/src/lowered/pass/validate_buffers.cpp @@ -0,0 +1,61 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/lowered/pass/validate_buffers.hpp" + +#include "snippets/utils/utils.hpp" +#include "snippets/itt.hpp" + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { + +bool ValidateBuffers::run(LinearIR& linear_ir, 
lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) { + OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::ValidateBuffers") + + const auto& lir_buffers = linear_ir.get_buffers(); + + // Firstly we check that all BufferExpression are in "get_buffers()" + for (const auto& expr : linear_ir) { + if (const auto& buffer_expr = ov::as_type_ptr(expr)) + OPENVINO_ASSERT(std::find(lir_buffers.cbegin(), lir_buffers.cend(), buffer_expr) != lir_buffers.cend(), + "All BufferExpressions must be in LinearIR.get_buffers()"); + } + + // Secondly we should validate buffers and their clusters + std::set cluster_ids; + std::map> dynamic_buffer_clusters, static_buffer_clusters; + for (const auto& buffer_expr : lir_buffers) { + // TODO [143395] : MemoryManager should provide exact containers with needed buffers (static or dynamic) without any `is_defined()` + auto& clusters = buffer_expr->is_defined() ? static_buffer_clusters : dynamic_buffer_clusters; + clusters[buffer_expr->get_cluster_id()].insert(buffer_expr); + cluster_ids.insert(buffer_expr->get_cluster_id()); + + buffer_expr->validate(); + } + + OPENVINO_ASSERT(cluster_ids.size() == dynamic_buffer_clusters.size() + static_buffer_clusters.size(), "Incorrect count of Buffer clusters"); + OPENVINO_ASSERT(cluster_ids.empty() || (*cluster_ids.cbegin() == 0 && *cluster_ids.crbegin() == (cluster_ids.size() - 1)), + "Incorrect identifiers of Buffer clusters"); + + for (const auto& p : static_buffer_clusters) { + const auto& cluster_id = p.first; + const auto& cluster = p.second; + OPENVINO_ASSERT(dynamic_buffer_clusters.count(cluster_id) == 0, "Buffers from the same cluster must be only static or dynamic"); + + OPENVINO_ASSERT(cluster.size() > 0, "Incorrect size of buffer cluster"); + size_t cluster_offset = (*cluster.cbegin())->get_offset(); + for (const auto& buffer_expr : cluster) { + OPENVINO_ASSERT(cluster_offset == buffer_expr->get_offset(), "Static Buffers from the same cluster must have 
the same offset!"); + } + } + + return !lir_buffers.empty(); +} + +} // namespace pass +} // namespace lowered +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/src/op/buffer.cpp b/src/common/snippets/src/op/buffer.cpp index 0c7403cd56f6f5..a99a75dc012f81 100644 --- a/src/common/snippets/src/op/buffer.cpp +++ b/src/common/snippets/src/op/buffer.cpp @@ -13,88 +13,92 @@ namespace ov { namespace snippets { namespace op { -Buffer::Buffer(const OutputVector& arguments, size_t allocation_size, size_t reg_group, size_t cluster_id) - : Op(arguments), m_allocation_size(allocation_size), m_reg_group(reg_group), m_cluster_id(cluster_id), m_offset(0) { +Buffer::Buffer(const ov::Output& arg) : Buffer(ov::OutputVector{arg}) {} + +Buffer::Buffer(const OutputVector& arguments) : Op(arguments), m_impl(std::make_shared()) { + constructor_validate_and_infer_types(); +} +Buffer::Buffer(const ov::Shape& shape, ov::element::Type element_type) : Op(), m_impl(std::make_shared(shape, element_type)) { + constructor_validate_and_infer_types(); +} +Buffer::Buffer(const OutputVector& arguments, std::shared_ptr impl) : Op(arguments), m_impl(std::move(impl)) { constructor_validate_and_infer_types(); } bool Buffer::visit_attributes(AttributeVisitor& visitor) { INTERNAL_OP_SCOPE(Buffer_visit_attributes); - auto element_type = get_element_type(); - auto allocation_size = utils::value2str(m_allocation_size); - auto offset = utils::value2str(m_offset); - visitor.on_attribute("allocation_size", allocation_size); - visitor.on_attribute("offset", offset); - visitor.on_attribute("reg_group", m_reg_group); - visitor.on_attribute("cluster_id", m_cluster_id); - visitor.on_attribute("element_type", element_type); + m_impl->visit_attributes(visitor); return true; } -bool Buffer::is_defined() const { - return !utils::is_dynamic_value(m_allocation_size); +void Buffer::validate_and_infer_types() { + INTERNAL_OP_SCOPE(Buffer_validate_and_infer_types); + 
m_impl->validate_and_infer_types(this); } -size_t Buffer::get_byte_size() const { - if (is_defined()) - return m_allocation_size * get_element_type().size(); - return utils::get_dynamic_value(); +std::shared_ptr Buffer::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(Buffer_clone_with_new_inputs); + return std::shared_ptr(new Buffer(new_args, m_impl->clone())); } -IntermediateMemoryBuffer::IntermediateMemoryBuffer(const OutputVector& arguments, size_t allocation_size, size_t reg_group, size_t cluster_id) - : Buffer(arguments, allocation_size, reg_group, cluster_id) { - constructor_validate_and_infer_types(); +Buffer::ShapeInfer::ShapeInfer(const std::shared_ptr& n) { + const auto& buffer = ov::as_type_ptr(n); + OPENVINO_ASSERT(buffer, "Got invalid node in Buffer::ShapeInfer"); + m_impl_shape_infer = buffer->m_impl->get_shape_infer(); } -IntermediateMemoryBuffer::IntermediateMemoryBuffer(const ov::Output& arg, size_t allocation_size, size_t reg_group, size_t cluster_id) - : IntermediateMemoryBuffer(OutputVector{arg}, allocation_size, reg_group, cluster_id) {} +IShapeInferSnippets::Result Buffer::ShapeInfer::infer(const std::vector& input_shapes) { + return m_impl_shape_infer->infer(input_shapes); +} -void IntermediateMemoryBuffer::validate_and_infer_types() { - INTERNAL_OP_SCOPE(Buffer_validate_and_infer_types); - ov::PartialShape output_shape; - set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); +std::shared_ptr Buffer::IntermediateMemoryImpl::clone() const { + return std::make_shared(); } -std::shared_ptr IntermediateMemoryBuffer::clone_with_new_inputs(const OutputVector& new_args) const { - INTERNAL_OP_SCOPE(Buffer_clone_with_new_inputs); - auto new_buffer = std::make_shared(new_args, m_allocation_size, m_reg_group, m_cluster_id); - new_buffer->set_offset(m_offset); - return new_buffer; +void Buffer::IntermediateMemoryImpl::validate_and_infer_types(Buffer* buffer) const { + OPENVINO_ASSERT(buffer, "Buffer is 
missed"); + OPENVINO_ASSERT(buffer->get_input_size() != 0, "IntermediateMemory Buffer must have inputs"); + const auto inputs = buffer->input_values(); + const auto& inshape = buffer->get_input_partial_shape(0); + const auto& intype = buffer->get_input_element_type(0); + OPENVINO_ASSERT(std::all_of(inputs.cbegin() + 1, inputs.cend(), + [&](const ov::Output& in) { return in.get_partial_shape() == inshape && in.get_element_type() == intype; }), + "All inputs of Buffers must have the same shape and element type"); + buffer->set_output_type(0, intype, inshape); } -NewMemoryBuffer::NewMemoryBuffer(const ov::Shape& shape, size_t reg_group, size_t cluster_id, ov::element::Type element_type) - : Buffer({}, ov::shape_size(shape), reg_group, cluster_id), m_output_shape(shape), m_element_type(element_type) { - constructor_validate_and_infer_types(); +Buffer::IntermediateMemoryImpl::ShapeInfer::Result Buffer::IntermediateMemoryImpl::ShapeInfer::infer(const std::vector& input_shapes) { + OPENVINO_ASSERT(!input_shapes.empty(), "IntermediateMemoryBuffer shape inference must have input shapes"); + return {{input_shapes[0].get()}, ShapeInferStatus::success}; } -void NewMemoryBuffer::validate_and_infer_types() { - INTERNAL_OP_SCOPE(Buffer_validate_and_infer_types); - OPENVINO_ASSERT(get_input_size() == 0, "Buffer with new allocated memory mustn't have arguments!"); - set_output_type(0, m_element_type, m_output_shape); +Buffer::NewMemoryImpl::NewMemoryImpl(const ov::Shape& shape, ov::element::Type element_type) + : m_shape(shape), m_element_type(element_type) {} + +size_t Buffer::NewMemoryImpl::get_allocation_size() const { + return ov::shape_size(m_shape); } -std::shared_ptr NewMemoryBuffer::clone_with_new_inputs(const OutputVector& new_args) const { - INTERNAL_OP_SCOPE(Buffer_clone_with_new_inputs); - check_new_args_count(this, new_args); - auto new_buffer = std::make_shared(m_output_shape, m_reg_group, m_cluster_id, m_element_type); - new_buffer->set_offset(m_offset); - return 
new_buffer; +std::shared_ptr Buffer::NewMemoryImpl::clone() const { + return std::make_shared(m_shape, m_element_type); } -void NewMemoryBuffer::set_element_type(ov::element::Type element_type) { - m_element_type = std::move(element_type); - // Apply the change - validate_and_infer_types(); +void Buffer::NewMemoryImpl::validate_and_infer_types(Buffer* buffer) const { + OPENVINO_ASSERT(buffer, "Buffer is missed"); + OPENVINO_ASSERT(buffer->get_input_size() == 0, "NewMemory Buffer mustn't have inputs"); + buffer->set_output_type(0, m_element_type, m_shape); } -NewMemoryBuffer::ShapeInfer::ShapeInfer(const std::shared_ptr& n) { - const auto& buffer = ov::as_type_ptr(n); - OPENVINO_ASSERT(buffer, "Got invalid node in NewMemoryBuffer::ShapeInfer"); - m_shape = buffer->get_shape(); +bool Buffer::NewMemoryImpl::visit_attributes(AttributeVisitor& visitor) { + visitor.on_attribute("shape", m_shape); + visitor.on_attribute("element_type", m_element_type); + return true; } -IShapeInferSnippets::Result NewMemoryBuffer::ShapeInfer::infer(const std::vector& input_shapes) { - OPENVINO_ASSERT(input_shapes.empty(), "NewMemoryBuffer shape inference mustn't have input shapes"); +Buffer::NewMemoryImpl::ShapeInfer::ShapeInfer(ov::Shape shape) : m_shape(std::move(shape)) {} + +Buffer::NewMemoryImpl::ShapeInfer::Result Buffer::NewMemoryImpl::ShapeInfer::infer(const std::vector& input_shapes) { + OPENVINO_ASSERT(input_shapes.empty(), "NewMemoryBuffer shape inference must have input shapes"); return {{m_shape}, ShapeInferStatus::success}; } diff --git a/src/common/snippets/src/op/serialization_node.cpp b/src/common/snippets/src/op/serialization_node.cpp index c136acea975a42..1718f770ad62d6 100644 --- a/src/common/snippets/src/op/serialization_node.cpp +++ b/src/common/snippets/src/op/serialization_node.cpp @@ -40,83 +40,7 @@ std::shared_ptr SerializationNode::clone_with_new_inputs(const OutputVecto } bool SerializationNode::visit_attributes(AttributeVisitor &visitor) { - auto 
is_planar_layout = [](const std::vector& layout) { - for (size_t i = 0; i < layout.size(); ++i) - if (layout[i] != i) return false; - return true; - }; - auto subtensor2str = [](const VectorDims& subtensor) { - std::stringstream ss; - for (size_t i = 0; i < subtensor.size(); ++i) { - const auto& v = subtensor[i]; - const auto v_str = utils::is_full_dim_value(v) ? "FULL_DIM" : - utils::is_dynamic_value(v) ? "?" : std::to_string(v); - const auto del = i < subtensor.size() - 1 ? ", " : ""; - ss << v_str << del; - } - return ss.str(); - }; - - std::vector in_regs, out_regs; - std::vector in_reg_types, out_reg_types; - std::vector> shapes; - std::vector> subtensors; - std::vector>> layouts; - for (size_t i = 0; i < m_expr->get_input_count(); i++) { - const auto& desc = m_expr->get_input_port_descriptor(i); - const auto& shape = desc->get_shape(); - if (!shape.empty()) - shapes.emplace_back("in_shape_" + std::to_string(i), ov::PartialShape(shape)); - - const auto& subtensor = desc->get_subtensor(); - if (!subtensor.empty()) - subtensors.emplace_back("in_subtensor_" + std::to_string(i), subtensor2str(subtensor)); - - const auto& layout = desc->get_layout(); - if (!layout.empty() && !is_planar_layout(layout)) - layouts.emplace_back("in_layout_" + std::to_string(i), layout); - - in_reg_types.emplace_back(regTypeToStr(desc->get_reg().type)); - in_regs.emplace_back(desc->get_reg().idx); - } - for (size_t i = 0; i < m_expr->get_output_count(); i++) { - const auto& desc = m_expr->get_output_port_descriptor(i); - const auto& shape = desc->get_shape(); - if (!shape.empty()) - shapes.emplace_back("out_shape_" + std::to_string(i), ov::PartialShape(shape)); - - const auto& subtensor = desc->get_subtensor(); - if (!subtensor.empty()) - subtensors.emplace_back("out_subtensor_" + std::to_string(i), subtensor2str(subtensor)); - - const auto& layout = desc->get_layout(); - if (!layout.empty() && !is_planar_layout(layout)) - layouts.emplace_back("out_layout_" + std::to_string(i), layout); 
- - out_reg_types.emplace_back(regTypeToStr(desc->get_reg().type)); - out_regs.emplace_back(desc->get_reg().idx); - } - - if (!in_regs.empty()) { - visitor.on_attribute("in_regs", in_regs); - visitor.on_attribute("in_reg_types", in_reg_types); - } - if (!out_regs.empty()) { - visitor.on_attribute("out_regs", out_regs); - visitor.on_attribute("out_reg_types", out_reg_types); - } - for (auto& s : shapes) - visitor.on_attribute(s.first, s.second); - for (auto& s : subtensors) - visitor.on_attribute(s.first, s.second); - for (auto& s : layouts) - visitor.on_attribute(s.first, s.second); - auto loop_ids = m_expr->get_loop_ids(); - visitor.on_attribute("loop_ids", loop_ids); - auto exec_num = m_expr->get_exec_num(); - visitor.on_attribute("execution_number", exec_num); - m_expr->get_node()->visit_attributes(visitor); - return true; + return m_expr->visit_attributes(visitor); } } // namespace op diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp index cf9f6b3121782e..98e3392a65e1e2 100644 --- a/src/common/snippets/src/op/subgraph.cpp +++ b/src/common/snippets/src/op/subgraph.cpp @@ -44,6 +44,7 @@ #include "snippets/lowered/pass/optimize_domain.hpp" #include "snippets/lowered/pass/insert_perf_count.hpp" #include "snippets/lowered/pass/validate_shapes.hpp" +#include "snippets/lowered/pass/validate_buffers.hpp" #include "snippets/lowered/pass/validate.hpp" #include "snippets/lowered/pass/pass_config.hpp" #include "snippets/lowered/pass/reduce_decomposition.hpp" @@ -472,9 +473,13 @@ void Subgraph::control_flow_transformations(size_t min_parallel_work_amount, siz pipeline.register_pass(m_linear_ir->get_config().m_are_buffers_optimized); pipeline.register_pass(); pipeline.register_positioned_passes(lowered_backend_passes); - pipeline.register_pass(); // must be last pipeline.run(*m_linear_ir); + lowered::pass::PassPipeline validation_pipeline; + validation_pipeline.register_pass(); + validation_pipeline.register_pass(); + 
validation_pipeline.run(*m_linear_ir); + #ifdef SNIPPETS_DEBUG_CAPS if (m_linear_ir->get_config().debug_config.perf_count_mode != DebugCapsConfig::PerfCountMode::Disabled) { lowered::pass::InsertPerfCount perf_count_pass({}); diff --git a/src/common/snippets/src/runtime_configurator.cpp b/src/common/snippets/src/runtime_configurator.cpp index f955780ca40a97..609005a9b86d80 100644 --- a/src/common/snippets/src/runtime_configurator.cpp +++ b/src/common/snippets/src/runtime_configurator.cpp @@ -134,17 +134,14 @@ void RuntimeConfigurator::init_data_info(const lowered::LinearIRCPtr& linear_ir) } void RuntimeConfigurator::init_buffer_info(const lowered::LinearIRCPtr& linear_ir) { - std::map> dynamic_buffer_clusters, static_buffer_clusters; + std::map> dynamic_buffer_clusters, static_buffer_clusters; // All needed checks are in Validate pass const auto& buffer_expressions = linear_ir->get_buffers(); for (const auto& buffer_expr : buffer_expressions) { - const auto buffer = ov::as_type_ptr(buffer_expr->get_node()); - OPENVINO_ASSERT(buffer, "Expected Buffer ops in Buffer expressions of LinearIR"); - // TODO [143395] : MemoryManager should provide exact containers with needed buffers (static or dynamic) without any `is_defined()` - auto& clusters = buffer->is_defined() ? static_buffer_clusters : dynamic_buffer_clusters; - clusters[buffer->get_cluster_id()].insert(buffer_expr); + auto& clusters = buffer_expr->is_defined() ? 
static_buffer_clusters : dynamic_buffer_clusters; + clusters[buffer_expr->get_cluster_id()].insert(buffer_expr); } const auto cluster_count = dynamic_buffer_clusters.size() + static_buffer_clusters.size(); @@ -156,7 +153,7 @@ void RuntimeConfigurator::init_buffer_info(const lowered::LinearIRCPtr& linear_i const auto& cluster = p.second; OPENVINO_ASSERT(cluster.size() > 0, "Incorrect size of buffer cluster"); - size_t cluster_offset = ov::as_type_ptr((*cluster.cbegin())->get_node())->get_offset(); + size_t cluster_offset = (*cluster.cbegin())->get_offset(); m_config->buffer_cluster_offsets[cluster_id] = cluster_offset; } @@ -246,7 +243,8 @@ void RuntimeConfigurator::update_buffer_scratchpad_size(const lowered::LinearIRC // No need to calculate allocation size of Buffers which are in Loops with `work_amount = 0` - they won't be executed if (is_not_executed(buffer_expr)) continue; - const auto& allocation_size = lowered::pass::ComputeBufferAllocationSize::get_allocation_size(loop_manager, buffer_expr, m_config->tile_rank); + buffer_expr->init_allocation_size(loop_manager, m_config->tile_rank); + const auto& allocation_size = buffer_expr->get_allocation_size(); OPENVINO_ASSERT(!utils::is_dynamic_value(allocation_size), "Buffer scratchpad size must be defined!"); additional_size = std::max(allocation_size * buffer_expr->get_node()->get_element_type().size(), additional_size); } diff --git a/src/common/snippets/src/shape_inference/shape_inference.cpp b/src/common/snippets/src/shape_inference/shape_inference.cpp index ff42dae602a54f..76a4c491c66983 100644 --- a/src/common/snippets/src/shape_inference/shape_inference.cpp +++ b/src/common/snippets/src/shape_inference/shape_inference.cpp @@ -39,7 +39,6 @@ const IShapeInferSnippetsFactory::TRegistry IShapeInferSnippetsFactory::registry SHAPE_INFER_PREDEFINED(op::ConvertSaturation, PassThroughShapeInfer), SHAPE_INFER_PREDEFINED(op::Load, PassThroughShapeInfer), SHAPE_INFER_PREDEFINED(op::Store, PassThroughShapeInfer), - 
SHAPE_INFER_PREDEFINED(op::IntermediateMemoryBuffer, PassThroughShapeInfer), SHAPE_INFER_PREDEFINED(op::Fill, PassThroughShapeInfer), SHAPE_INFER_PREDEFINED(ov::op::v0::Parameter, PassThroughShapeInfer), SHAPE_INFER_PREDEFINED(ov::op::v1::LogicalNot, PassThroughShapeInfer), @@ -70,7 +69,7 @@ const IShapeInferSnippetsFactory::TRegistry IShapeInferSnippetsFactory::registry SHAPE_INFER_OP_SPECIFIC(op::RankNormalization), SHAPE_INFER_OP_SPECIFIC(op::BroadcastLoad), SHAPE_INFER_OP_SPECIFIC(op::BroadcastMove), - SHAPE_INFER_OP_SPECIFIC(op::NewMemoryBuffer), + SHAPE_INFER_OP_SPECIFIC(op::Buffer), }; #undef SHAPE_INFER_OP_SPECIFIC_EXTERNAL #undef SHAPE_INFER_OP_SPECIFIC diff --git a/src/common/snippets/tests/src/lir_comparator.cpp b/src/common/snippets/tests/src/lir_comparator.cpp index 3552c81cdf40a5..82d5b9dcf91441 100644 --- a/src/common/snippets/tests/src/lir_comparator.cpp +++ b/src/common/snippets/tests/src/lir_comparator.cpp @@ -61,10 +61,7 @@ LIRComparator::Result LIRComparator::compare(const LinearIRPtr& linear_ir, const auto& buffers_ref = linear_ir_ref->get_buffers(); COMPARE("Number of buffers", buffers.size(), buffers_ref.size()); - auto run_comparison = [&](const LinearIR::constExprIt& expr_it, const LinearIR::constExprIt& expr_it_ref) { - const auto& expr = expr_it->get(); - const auto& expr_ref = expr_it_ref->get(); - + auto run_comparison = [&](const ExpressionPtr& expr, const ExpressionPtr& expr_ref) { const auto node = expr->get_node(); const auto node_ref = expr_ref->get_node(); if (m_nodes_cmp_values != NodesCmpValues::NONE) @@ -90,11 +87,11 @@ LIRComparator::Result LIRComparator::compare(const LinearIRPtr& linear_ir, }; for (auto param_it = parameters.begin(), param_it_ref = parameters_ref.begin(); param_it != parameters.end(); ++param_it, ++param_it_ref) - PROPAGATE_ERROR("", run_comparison(param_it, param_it_ref)); + PROPAGATE_ERROR("", run_comparison(*param_it, *param_it_ref)); for (auto expr_it = ops.begin(), expr_it_ref = ops_ref.begin(); expr_it 
!= ops.end(); ++expr_it, ++expr_it_ref) - PROPAGATE_ERROR("", run_comparison(expr_it, expr_it_ref)); + PROPAGATE_ERROR("", run_comparison(*expr_it, *expr_it_ref)); for (auto result_it = results.begin(), result_it_ref = results_ref.begin(); result_it != results.end(); ++result_it, ++result_it_ref) - PROPAGATE_ERROR("", run_comparison(result_it, result_it_ref)); + PROPAGATE_ERROR("", run_comparison(*result_it, *result_it_ref)); if (should_compare(LIRCmpValues::LOOP_MANAGER)) { PROPAGATE_ERROR("Loop managers", compare_loop_managers(linear_ir->get_loop_manager(), linear_ir_ref->get_loop_manager())); diff --git a/src/common/snippets/tests/src/lowered/pass/buffer_allocation.cpp b/src/common/snippets/tests/src/lowered/pass/buffer_allocation.cpp index 4dc3f2dae7e867..ac521631917897 100644 --- a/src/common/snippets/tests/src/lowered/pass/buffer_allocation.cpp +++ b/src/common/snippets/tests/src/lowered/pass/buffer_allocation.cpp @@ -82,11 +82,9 @@ void BufferAllocationTest::ApplyTransformations(const std::shared_ptr reg_groups, clusters; - for (const auto& expr : m_linear_ir) { - if (const auto buffer = ov::as_type_ptr(expr->get_node())) { - reg_groups.insert(buffer->get_reg_group()); - clusters.insert(buffer->get_cluster_id()); - } + for (const auto& buffer_expr : m_linear_ir.get_buffers()) { + reg_groups.insert(buffer_expr->get_reg_group()); + clusters.insert(buffer_expr->get_cluster_id()); } EXPECT_EQ(reg_groups.size(), m_expected_reg_group_count); EXPECT_EQ(clusters.size(), m_expected_cluster_count); @@ -100,9 +98,9 @@ std::shared_ptr EltwiseBufferAllocationTest::GetModel() const { const auto parameter0 = std::make_shared(ov::element::f32, ov::PartialShape({1, 3, 100, 100})); const auto parameter1 = std::make_shared(ov::element::f32, ov::PartialShape({1, 3, 100, 100})); const auto add = std::make_shared(parameter0, parameter1); - const auto buffer0 = std::make_shared(add); + const auto buffer0 = std::make_shared(add); const auto relu = std::make_shared(buffer0); - const 
auto buffer1 = std::make_shared(relu); + const auto buffer1 = std::make_shared(relu); const auto exp = std::make_shared(buffer1); const auto body = std::make_shared(std::make_shared(exp), ov::ParameterVector{parameter0, parameter1}); diff --git a/src/common/snippets/tests/src/lowering_utils.cpp b/src/common/snippets/tests/src/lowering_utils.cpp index 136dccb5fac667..e9ed04bf8da5a4 100644 --- a/src/common/snippets/tests/src/lowering_utils.cpp +++ b/src/common/snippets/tests/src/lowering_utils.cpp @@ -51,8 +51,7 @@ DummyTargetMachine::DummyTargetMachine(const std::vector& jitters[ov::snippets::op::PerfCountEnd::get_type_info_static()] = dummy_functor; #endif jitters[ov::snippets::op::Brgemm::get_type_info_static()] = dummy_functor; - jitters[ov::snippets::op::IntermediateMemoryBuffer::get_type_info_static()] = dummy_functor; - jitters[ov::snippets::op::NewMemoryBuffer::get_type_info_static()] = dummy_functor; + jitters[ov::snippets::op::Buffer::get_type_info_static()] = dummy_functor; jitters[ov::snippets::op::VectorBuffer::get_type_info_static()] = dummy_functor; jitters[ov::snippets::op::Fill::get_type_info_static()] = dummy_functor; jitters[ov::snippets::op::ReduceMax::get_type_info_static()] = dummy_functor; diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_kernel_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_kernel_emitter.cpp index 9345b79c37e710..8f7a54dc9ebdb3 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_kernel_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_kernel_emitter.cpp @@ -38,6 +38,7 @@ jit_kernel_emitter::jit_kernel_emitter(jit_generator* h, cpu_isa_t isa, const ov jcp = *reinterpret_cast(kernel->compile_params); const auto& parameters = body->get_parameters(); const auto& results = body->get_results(); + const auto& buffers = body->get_buffers(); num_inputs = parameters.size(); num_outputs = results.size(); for (const auto& param : parameters) @@ -46,19 
+47,22 @@ jit_kernel_emitter::jit_kernel_emitter(jit_generator* h, cpu_isa_t isa, const ov mem_access_exprs.push_back(result); std::set unique_buffers; - for (const auto& expr : *body) { - if (const auto buffer = ov::as_type_ptr(expr->get_node())) { - const auto buffer_id = buffer->get_cluster_id(); - if (unique_buffers.count(buffer_id) == 0) { - mem_access_exprs.push_back(expr); - unique_buffers.insert(buffer_id); - } - } else { - if (std::find(parameters.cbegin(), parameters.cend(), expr) == parameters.cend() && - std::find(results.cbegin(), results.cend(), expr) == results.cend()) - general_exprs.emplace_back(expr); + for (const auto& buffer_expr : buffers) { + const auto buffer_reg_group = buffer_expr->get_reg_group(); + if (unique_buffers.count(buffer_reg_group) == 0) { + mem_access_exprs.push_back(buffer_expr); + unique_buffers.insert(buffer_reg_group); } } + + using ExprSet = std::unordered_set; + const ExprSet params_set(parameters.cbegin(), parameters.cend()); + const ExprSet results_set(results.cbegin(), results.cend()); + const ExprSet buffers_set(buffers.cbegin(), buffers.cend()); + for (const auto& expr : *body) { + if (params_set.count(expr) == 0 && results_set.count(expr) == 0 && buffers_set.count(expr) == 0) + general_exprs.emplace_back(expr); + } num_unique_buffers = unique_buffers.size(); } diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp index 01a87d849f9731..1da6cd7121487f 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp @@ -159,8 +159,7 @@ intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t ho // data movement jitters[op::v0::Parameter::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[op::v0::Result::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); - 
jitters[snippets::op::IntermediateMemoryBuffer::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); - jitters[snippets::op::NewMemoryBuffer::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); + jitters[snippets::op::Buffer::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::VectorBuffer::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::RankNormalization::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::Reshape::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.cpp index ff38c5586af106..4c36aa3b21ab35 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.cpp @@ -41,7 +41,7 @@ jit_brgemm_emitter::jit_brgemm_emitter(jit_generator* h, cpu_isa_t isa, "Jit emitter is called when the shapes are unknown"); auto get_cluster_id = [](const snippets::lowered::ExpressionPort& p) { // Note: NewMemoryBuffer is used as a scratchpad and can't be dynamic, so we don't need to account for them here - if (const auto buffer = ov::as_type_ptr(p.get_expr()->get_node())) + if (const auto buffer = ov::as_type_ptr(p.get_expr())) return buffer->get_cluster_id(); else return SIZE_MAX; diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_kernel_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_kernel_emitter.cpp index ff58ef8b0a5bcb..4a27c8e17150e8 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_kernel_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_kernel_emitter.cpp @@ -23,6 +23,7 @@ jit_kernel_emitter::jit_kernel_emitter(jit_generator* h, 
cpu_isa_t isa, const ov jcp = *reinterpret_cast(kernel->compile_params); const auto& parameters = body->get_parameters(); const auto& results = body->get_results(); + const auto& buffers = body->get_buffers(); num_inputs = parameters.size(); num_outputs = results.size(); for (const auto& param : parameters) @@ -31,19 +32,22 @@ jit_kernel_emitter::jit_kernel_emitter(jit_generator* h, cpu_isa_t isa, const ov mem_access_exprs.push_back(result); std::set unique_buffers; - for (const auto& expr : *body) { - if (const auto buffer = ov::as_type_ptr(expr->get_node())) { - const auto buffer_reg_group = buffer->get_reg_group(); - if (unique_buffers.count(buffer_reg_group) == 0) { - mem_access_exprs.push_back(expr); - unique_buffers.insert(buffer_reg_group); - } - } else { - if (std::find(parameters.cbegin(), parameters.cend(), expr) == parameters.cend() && - std::find(results.cbegin(), results.cend(), expr) == results.cend()) - general_exprs.emplace_back(expr); + for (const auto& buffer_expr : buffers) { + const auto buffer_reg_group = buffer_expr->get_reg_group(); + if (unique_buffers.count(buffer_reg_group) == 0) { + mem_access_exprs.push_back(buffer_expr); + unique_buffers.insert(buffer_reg_group); } } + + using ExprSet = std::unordered_set; + const ExprSet params_set(parameters.cbegin(), parameters.cend()); + const ExprSet results_set(results.cbegin(), results.cend()); + const ExprSet buffers_set(buffers.cbegin(), buffers.cend()); + for (const auto& expr : *body) { + if (params_set.count(expr) == 0 && results_set.count(expr) == 0 && buffers_set.count(expr) == 0) + general_exprs.emplace_back(expr); + } num_unique_buffers = unique_buffers.size(); } diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.cpp index 1d8c26e3d709fa..f2fd978edc6aaf 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.cpp +++ 
b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.cpp @@ -60,7 +60,7 @@ size_t jit_memory_emitter::aux_gprs_count() const { size_t jit_memory_emitter::get_parent_buffer_cluster_id(const ov::snippets::lowered::ExpressionPtr& expr) { OV_CPU_JIT_EMITTER_ASSERT(expr->get_input_port_connectors().size() == 1, "MemoryAccess must have one parent"); const auto& parent_expr = expr->get_input_port_connector(0)->get_source().get_expr(); - if (const auto buffer = ov::as_type_ptr(parent_expr->get_node())) { + if (const auto buffer = ov::as_type_ptr(parent_expr)) { return buffer->get_cluster_id(); } return SIZE_MAX; @@ -70,7 +70,7 @@ size_t jit_memory_emitter::get_consumer_buffer_cluster_id(const ov::snippets::lo OV_CPU_JIT_EMITTER_ASSERT(expr->get_output_port_connectors().size() == 1, "MemoryAccess must have one consumer"); const auto& consumers = expr->get_output_port_connector(0)->get_consumers(); for (const auto& consumer : consumers) - if (const auto buffer = ov::as_type_ptr(consumer.get_expr()->get_node())) + if (const auto buffer = ov::as_type_ptr(consumer.get_expr())) return buffer->get_cluster_id(); return SIZE_MAX; } diff --git a/src/plugins/intel_cpu/src/extension.cpp b/src/plugins/intel_cpu/src/extension.cpp index d5a8801ffedeac..5e43da6e2bfb86 100644 --- a/src/plugins/intel_cpu/src/extension.cpp +++ b/src/plugins/intel_cpu/src/extension.cpp @@ -161,12 +161,11 @@ class TypeRelaxedExtension : public ov::OpExtension> { OP_EXTENSION(ov::snippets::op::HorizonSum) \ OP_EXTENSION(ov::snippets::op::KernelStatic) \ OP_EXTENSION(ov::snippets::op::KernelDynamic) \ - OP_EXTENSION(ov::snippets::op::IntermediateMemoryBuffer) \ OP_EXTENSION(ov::snippets::op::Load) \ OP_EXTENSION(ov::snippets::op::LoadReshape) \ OP_EXTENSION(ov::snippets::op::LoopBegin) \ OP_EXTENSION(ov::snippets::op::LoopEnd) \ - OP_EXTENSION(ov::snippets::op::NewMemoryBuffer) \ + OP_EXTENSION(ov::snippets::op::Buffer) \ OP_EXTENSION(ov::snippets::op::Nop) \ 
OP_EXTENSION(ov::snippets::op::PowerStatic) \ OP_EXTENSION(ov::snippets::op::Scalar) \ diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index e166fc8bf453e7..8d04c41676b193 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -21,6 +21,7 @@ #include "snippets/lowered/pass/optimize_domain.hpp" #include "snippets/lowered/pass/insert_loops.hpp" #include "snippets/lowered/pass/mark_loops.hpp" +#include "snippets/lowered/pass/insert_buffers.hpp" #include "transformations/defs.hpp" #include "transformations/cpu_opset/common/pass/convert_to_swish_cpu.hpp" #include "transformations/snippets/common/pass/mul_add_to_fma.hpp" @@ -32,7 +33,7 @@ #else #include "emitters/snippets/x64/cpu_generator.hpp" #include "transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.hpp" -#include "transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.hpp" +#include "transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.hpp" #include "transformations/snippets/x64/pass/remove_converts.hpp" #include "transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.hpp" #include "transformations/snippets/x64/pass/enforce_precision.hpp" @@ -682,8 +683,8 @@ Subgraph::ControlFlowPasses Subgraph::getControlFlowPasses() const { ov::intel_cpu::pass::FuseLoadStoreConvert); #if defined(OPENVINO_ARCH_X86_64) - SNIPPETS_REGISTER_PASS_RELATIVE(Place::After, ov::intel_cpu::pass::FuseLoadStoreConvert, - ov::intel_cpu::pass::SetBrgemmCopyBBuffersShape); + SNIPPETS_REGISTER_PASS_RELATIVE(Place::Before, ov::snippets::lowered::pass::InsertBuffers, + ov::intel_cpu::pass::InsertBrgemmCopyBBuffers); #endif #ifdef SNIPPETS_LIBXSMM_TPP diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.cpp index cc30edef38086f..dfe4441de90699 100644 --- 
a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.cpp @@ -137,7 +137,7 @@ std::shared_ptr BrgemmCPU::get_brgemm_copy() const { if (const auto brgemm_copy_b = ov::as_type_ptr(b_input_node)) { return brgemm_copy_b; } - if (ov::is_type(b_input_node)) { + if (ov::is_type(b_input_node)) { if (const auto brgemm_copy_b = ov::as_type_ptr(b_input_node->get_input_node_shared_ptr(0))) { return brgemm_copy_b; } diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp index e5fac40ac09604..af70218ce0635f 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp @@ -77,46 +77,6 @@ size_t get_elems_in_vec(const ov::element::Type& precision) { } namespace repacking { -size_t get_repacking_buffer_size(const ov::snippets::lowered::ExpressionPtr& copy_b_expr) { - OPENVINO_ASSERT(ov::is_type(copy_b_expr->get_node())); - const auto& in_desc = copy_b_expr->get_input_port_descriptor(0); - const auto& in_layout = in_desc->get_layout(); - const auto& in_subtensor = ov::snippets::utils::get_projected_subtensor(copy_b_expr->get_input_port(0)); - - const size_t n_blk = *in_subtensor.rbegin(); - const size_t k_blk = *++in_subtensor.rbegin(); - OPENVINO_ASSERT(!is_dynamic_value(n_blk) && !is_dynamic_value(k_blk), "get_repacking_buffer_size must be called with static subtensor values"); - - const auto& precision = copy_b_expr->get_node()->get_input_element_type(0); - // Repacking buffer shape is set in accordance to OneDNN requirements - const size_t N_dim = std::max(n_blk, compute_inner_n_block(precision)); - if (!in_layout.empty() && in_layout.back() != in_layout.size() - 1) { - // In case of transpose, K dimension must be rounded-up to number of elems in vector register - // For 
the details, please see 'transpose16x8' and 'fixup16x16' implementations and usage in onednn/src/cpu/x64/matmul/brgemm_matmul_copy_utils.cpp - const auto elems_in_vec = brgemm_utils::get_elems_in_vec(precision); - return N_dim * rnd_up(k_blk, elems_in_vec); - } else { - // Low precision repacking writes the result by m_brgemmVNNIFactor * m_inner_n_block blocks - // despite the actual size of the input data. Because of that we have to round-up the allocation shape to always have enough memory allocated. - // For the details, please see 'copy_4x64' and 'copy_2x32' implementations and usage in onednn/src/cpu/x64/matmul/brgemm_matmul_copy_utils.cpp - const auto brgemmVNNIFactor = brgemm_utils::compute_vnni_factor(precision); - OPENVINO_ASSERT(brgemmVNNIFactor > 0, "brgemmVNNIFactor value must be positive."); - return N_dim * rnd_up(k_blk, brgemmVNNIFactor); - } -} - -size_t get_compensations_buffer_size(const ov::snippets::lowered::ExpressionPtr& copy_b_expr) { - OPENVINO_ASSERT(ov::is_type(copy_b_expr->get_node())); - const auto& in_subtensor = ov::snippets::utils::get_projected_subtensor(copy_b_expr->get_input_port(0)); - const size_t n_blk = *in_subtensor.rbegin(); - OPENVINO_ASSERT(!is_dynamic_value(n_blk), "get_compensations_buffer_size must be called with static subtensor values"); - const auto& precision = copy_b_expr->get_node()->get_input_element_type(0); - // Compensations are computed during repacking, so we need to round-up allocation shape according to m_inner_n_block - // because of OneDNN implementation nuances (as in get_repacking_buffer_size). 
- // However, the compensations are computed by N dimension, so K dimension doesn't affect the compensations buffer - return std::max(n_blk, compute_inner_n_block(precision)); -} - size_t compute_out_leading_dim(const size_t n_block, const ov::element::Type& precision) { return std::max(n_block, compute_inner_n_block(precision)); } diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp index 32d2264822ad57..d0360e45a62e18 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp @@ -42,18 +42,6 @@ size_t compute_vnni_factor(const ov::element::Type& precision); size_t get_elems_in_vec(const ov::element::Type& precision); namespace repacking { -/** - * @brief Computes buffer size that OneDNN impl needs for repacked tensor - * @param copy_b_expr Repacking expression whose information (tensor precision, layout, subtensors) is used for - * buffer size computations - */ -size_t get_repacking_buffer_size(const ov::snippets::lowered::ExpressionPtr& copy_b_expr); -/** - * @brief Computes buffer size that OneDNN impl needs for compensations - * @param copy_b_expr Repacking expression whose information (tensor precision, subtensors) is used for - * buffer size computations - */ -size_t get_compensations_buffer_size(const ov::snippets::lowered::ExpressionPtr& copy_b_expr); /** * @brief Computes leading dimension (LDB) which must be used in brgemm and brgemm_copy_b emitters * @param n_block N block size shared between BrgemmCPU and BrgemmCopyB node diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.cpp index 3aff94fb7f20f9..6dda47e47326aa 100644 --- 
a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.cpp @@ -88,7 +88,7 @@ pass::BrgemmToBrgemmCPU::BrgemmToBrgemmCPU() { set_full_port_desc(output); if (with_amx(brgemm_type)) { - const auto scratch = std::make_shared(ov::Shape{BrgemmCPU::SCRATCH_BYTE_SIZE}); + const auto scratch = std::make_shared(ov::Shape{BrgemmCPU::SCRATCH_BYTE_SIZE}); brgemm_cpu = std::make_shared(brgemm->input_value(0), brgemm_repacking->output(0), scratch, brgemm_type, offset_a, offset_b, 0, offset_c, layout_a, std::vector{}, layout_c); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.cpp index a5382f5afed53f..51565537c43568 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.cpp @@ -34,8 +34,8 @@ std::shared_ptr BrgemmCPUBlocking::DummyPass: LinearIR::constExprIt BrgemmCPUBlocking::move_new_memory_buffer(LinearIR& linear_ir, const LinearIR::constExprIt& brgemm_it) { const auto& brgemm_expr = brgemm_it->get(); const auto wsp_expr = brgemm_expr->get_input_port_connector(2)->get_source().get_expr(); - const auto wsp_buffer = ov::as_type_ptr(wsp_expr->get_node()); - OPENVINO_ASSERT(wsp_buffer, "Incorrect Scratchpad buffer for Brgemm AMX"); + const auto wsp_buffer = ov::as_type_ptr(wsp_expr); + OPENVINO_ASSERT(wsp_buffer && wsp_buffer->is_independent_memory(), "Incorrect Scratchpad buffer for Brgemm AMX"); // If scratchpad with temp memory is not explicitly before Brgemm, need to move to there. 
if (wsp_expr != *std::prev(brgemm_it)) { const auto wsp_it = linear_ir.find(wsp_expr); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/expressions/brgemm_copy_b_buffer_expressions.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/expressions/brgemm_copy_b_buffer_expressions.cpp new file mode 100644 index 00000000000000..9d7adab2fdc09b --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/expressions/brgemm_copy_b_buffer_expressions.cpp @@ -0,0 +1,91 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "brgemm_copy_b_buffer_expressions.hpp" + +#include "snippets/lowered/loop_manager.hpp" +#include "snippets/utils/utils.hpp" + +#include "transformations/snippets/x64/op/brgemm_copy_b.hpp" +#include "utils/general_utils.h" + + +using namespace ov::intel_cpu::brgemm_utils::repacking; +using namespace ov::snippets::lowered; + +namespace ov { +namespace intel_cpu { + +RepackedWeightsBufferExpression::RepackedWeightsBufferExpression(const std::shared_ptr& n, + const std::shared_ptr& factory) : BufferExpression(n, factory) {} + +snippets::lowered::ExpressionPtr RepackedWeightsBufferExpression::clone() const { + return std::shared_ptr(new RepackedWeightsBufferExpression(*this)); +} + +void RepackedWeightsBufferExpression::validate() const { + BufferExpression::validate(); + OPENVINO_ASSERT(get_input_count() == 1, "RepackedWeightsBufferExpression must have only one input"); + const auto& parent_out = get_input_port_connector(0)->get_source(); + OPENVINO_ASSERT(ov::is_type(parent_out.get_expr()->get_node()) && parent_out.get_index() == 0, + "RepackedWeightsBufferExpression expects BrgemmCopyB as parent expression"); +} + +void RepackedWeightsBufferExpression::init_allocation_size(const std::shared_ptr& loop_manager, size_t allocation_rank) { + const auto& parent_expr = get_input_port_connector(0)->get_source().get_expr(); + const auto& 
in_layout = parent_expr->get_input_port_descriptor(0)->get_layout(); + const auto& in_subtensor = ov::snippets::utils::get_projected_subtensor(parent_expr->get_input_port(0)); + + const size_t n_blk = *in_subtensor.rbegin(); + const size_t k_blk = *++in_subtensor.rbegin(); + OPENVINO_ASSERT(!ov::snippets::utils::is_dynamic_value(n_blk) && !ov::snippets::utils::is_dynamic_value(k_blk), + "RepackedWeightsBufferExpression supports only static subtensor values"); + + const auto& precision = get_node()->get_input_element_type(0); + // Repacking buffer shape is set in accordance to OneDNN requirements + const size_t N_dim = std::max(n_blk, compute_inner_n_block(precision)); + if (!in_layout.empty() && in_layout.back() != in_layout.size() - 1) { + // In case of transpose, K dimension must be rounded-up to number of elems in vector register + // For the details, please see 'transpose16x8' and 'fixup16x16' implementations and usage in onednn/src/cpu/x64/matmul/brgemm_matmul_copy_utils.cpp + const auto elems_in_vec = brgemm_utils::get_elems_in_vec(precision); + m_allocation_size = N_dim * rnd_up(k_blk, elems_in_vec); + } else { + // Low precision repacking writes the result by m_brgemmVNNIFactor * m_inner_n_block blocks + // despite the actual size of the input data. Because of that we have to round-up the allocation shape to always have enough memory allocated. 
+ // For the details, please see 'copy_4x64' and 'copy_2x32' implementations and usage in onednn/src/cpu/x64/matmul/brgemm_matmul_copy_utils.cpp + const auto brgemmVNNIFactor = brgemm_utils::compute_vnni_factor(precision); + OPENVINO_ASSERT(brgemmVNNIFactor > 0, "brgemmVNNIFactor value must be positive."); + m_allocation_size = N_dim * rnd_up(k_blk, brgemmVNNIFactor); + } +} + +CompensationsBufferExpression::CompensationsBufferExpression(const std::shared_ptr& n, + const std::shared_ptr& factory) : BufferExpression(n, factory) {} + +snippets::lowered::ExpressionPtr CompensationsBufferExpression::clone() const { + return std::shared_ptr(new CompensationsBufferExpression(*this)); +} + +void CompensationsBufferExpression::validate() const { + BufferExpression::validate(); + OPENVINO_ASSERT(get_input_count() == 1, "CompensationsBufferExpression must have only one input"); + const auto& parent_out = get_input_port_connector(0)->get_source(); + OPENVINO_ASSERT(ov::is_type(parent_out.get_expr()->get_node()) && parent_out.get_index() == 1, + "CompensationsBufferExpression expects BrgemmCopyB as parent expression"); +} + +void CompensationsBufferExpression::init_allocation_size(const std::shared_ptr& loop_manager, size_t allocation_rank) { + const auto& parent_expr = get_input_port_connector(0)->get_source().get_expr(); + const auto& in_subtensor = ov::snippets::utils::get_projected_subtensor(parent_expr->get_input_port(0)); + const size_t n_blk = *in_subtensor.rbegin(); + OPENVINO_ASSERT(!ov::snippets::utils::is_dynamic_value(n_blk), "CompensationsBufferExpression supports only static subtensor values"); + const auto& precision = parent_expr->get_node()->get_input_element_type(0); + // Compensations are computed during repacking, so we need to round-up allocation shape according to m_inner_n_block + // because of OneDNN implementation nuances (as in RepackedWeightsBufferExpression::init_allocation_size). 
+ // However, the compensations are computed by N dimension, so K dimension doesn't affect the compensations buffer + m_allocation_size = std::max(n_blk, compute_inner_n_block(precision)); +} + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/expressions/brgemm_copy_b_buffer_expressions.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/expressions/brgemm_copy_b_buffer_expressions.hpp new file mode 100644 index 00000000000000..b85e75c55da30b --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/expressions/brgemm_copy_b_buffer_expressions.hpp @@ -0,0 +1,43 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "snippets/lowered/expressions/buffer_expression.hpp" + +namespace ov { +namespace intel_cpu { + +class RepackedWeightsBufferExpression : public snippets::lowered::BufferExpression { + friend class snippets::lowered::ExpressionFactory; +public: + OPENVINO_RTTI("RepackedWeightsBufferExpression", "0", BufferExpression) + RepackedWeightsBufferExpression() = default; + + void validate() const override; + void init_allocation_size(const std::shared_ptr& loop_manager, size_t allocation_rank) override; + +private: + RepackedWeightsBufferExpression(const std::shared_ptr& n, const std::shared_ptr& factory); + + snippets::lowered::ExpressionPtr clone() const override; +}; + +class CompensationsBufferExpression : public snippets::lowered::BufferExpression { + friend class snippets::lowered::ExpressionFactory; +public: + OPENVINO_RTTI("CompensationsBufferExpression", "0", BufferExpression) + CompensationsBufferExpression() = default; + + void validate() const override; + void init_allocation_size(const std::shared_ptr& loop_manager, size_t allocation_rank) override; + +private: + CompensationsBufferExpression(const std::shared_ptr& n, const std::shared_ptr& factory); + + 
 snippets::lowered::ExpressionPtr clone() const override; +}; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.cpp new file mode 100644 index 00000000000000..bd8dd12bd39256 --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.cpp @@ -0,0 +1,60 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "insert_brgemm_copy_b_buffers.hpp" + +#include "snippets/lowered/loop_manager.hpp" +#include "snippets/itt.hpp" + +#include "transformations/snippets/x64/op/brgemm_copy_b.hpp" +#include "expressions/brgemm_copy_b_buffer_expressions.hpp" + + +using namespace ov::intel_cpu::brgemm_utils::repacking; +using namespace ov::snippets::lowered; + +namespace ov { +namespace intel_cpu { +namespace pass { + +bool InsertBrgemmCopyBBuffers::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) { + OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::InsertBrgemmCopyBBuffers") + + const auto& factory = linear_ir.get_expr_factory(); + + auto insert_buffer = [&](const ExpressionPtr& copy_b_expr, size_t out_port, LinearIR::constExprIt insertion_pos) { + const auto& copy_b = ov::as_type_ptr(copy_b_expr->get_node()); + const auto& copy_b_out = copy_b_expr->get_output_port_connector(out_port); + const auto copy_b_consumers = copy_b_out->get_consumers(); + OPENVINO_ASSERT(copy_b_consumers.size() == 1, "BrgemmCopyB must have only one consumer on each out port - Brgemm"); + const auto& buffer_op = std::make_shared(copy_b->output(out_port)); + BufferExpressionPtr buffer_expr = nullptr; + if (out_port == 0) { + buffer_expr = factory->build(buffer_op, {copy_b_out}); + } else if (out_port == 1 && with_compensations(copy_b->get_type())) 
{ + buffer_expr = factory->build(buffer_op, {copy_b_out}); + } else { + OPENVINO_THROW("BrgemmCopyB has incorrect output ports"); + } + return linear_ir.insert_expr(buffer_expr, LoopManager::get_common_outer_loops(copy_b_expr, copy_b_consumers.begin()->get_expr()), + true, insertion_pos, {copy_b_consumers}); + }; + + bool modified = false; + for (auto expr_it = begin; expr_it != end; ++expr_it) { + const auto expr = *expr_it; + if (auto copy_b = ov::as_type_ptr(expr->get_node())) { + for (size_t i = 0; i < expr->get_output_count(); ++i) { + expr_it = insert_buffer(expr, i, std::next(expr_it)); + } + modified = true; + } + } + return modified; +} + +} // namespace pass +} // namespace intel_cpu +} // namespace ov + diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.hpp new file mode 100644 index 00000000000000..a08bc507aa60da --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.hpp @@ -0,0 +1,28 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "snippets/lowered/pass/pass.hpp" + +namespace ov { +namespace intel_cpu { +namespace pass { + +/** + * @interface InsertBrgemmCopyBBuffers + * @brief Insert Buffers after BrgemmCopyB with algorithm of allocation size calculation which + * differs from the common algorithm + * @ingroup snippets + */ +class InsertBrgemmCopyBBuffers: public snippets::lowered::pass::RangedPass { +public: + InsertBrgemmCopyBBuffers() = default; + OPENVINO_RTTI("InsertBrgemmCopyBBuffers", "Pass"); + bool run(snippets::lowered::LinearIR& linear_ir, snippets::lowered::LinearIR::constExprIt begin, snippets::lowered::LinearIR::constExprIt end) override; +}; + +} // namespace pass +} // namespace intel_cpu +} // namespace ov diff --git 
a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.cpp deleted file mode 100644 index 332c0cccaf4acc..00000000000000 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.cpp +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "snippets/itt.hpp" - -#include "set_brgemm_copy_b_buffers_shape.hpp" -#include "snippets/snippets_isa.hpp" -#include "snippets/utils/utils.hpp" - -#include "transformations/snippets/x64/op/brgemm_copy_b.hpp" -#include "transformations/snippets/x64/op/brgemm_utils.hpp" - -using namespace ov::intel_cpu::brgemm_utils::repacking; - -bool ov::intel_cpu::pass::SetBrgemmCopyBBuffersShape::run(snippets::lowered::LinearIR& linear_ir, - snippets::lowered::LinearIR::constExprIt begin, - snippets::lowered::LinearIR::constExprIt end) { - OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::SetBrgemmCopyBBuffersShape") - - auto get_buffer_from_output = [](const snippets::lowered::ExpressionPtr& expr, const size_t out_idx) { - const auto& consumers = expr->get_output_port_connector(out_idx)->get_consumers(); - OPENVINO_ASSERT(consumers.size() == 1, "BrgemmCopyB must have only 1 consumer"); - const auto buffer = ov::as_type_ptr(consumers.begin()->get_expr()->get_node()); - OPENVINO_ASSERT(buffer, "BrgemmCopyB consumer must be Buffer"); - return buffer; - }; - - bool modified = false; - for (auto expr_it = begin; expr_it != end; ++expr_it) { - const auto& expr = *expr_it; - if (auto copy_b = ov::as_type_ptr(expr->get_node())) { - const auto buffer = get_buffer_from_output(expr, 0); - buffer->set_allocation_size(get_repacking_buffer_size(expr)); - if (with_compensations(copy_b->get_type())) { - const auto compensations_buffer = get_buffer_from_output(expr, 
1); - compensations_buffer->set_allocation_size(get_compensations_buffer_size(expr)); - } - modified = true; - } - } - return modified; -} diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.hpp deleted file mode 100644 index 1b348ecbf2740c..00000000000000 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.hpp +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include "snippets/lowered/pass/pass.hpp" - -namespace ov { -namespace intel_cpu { -namespace pass { - -/** - * @interface SetBrgemmCopyBBuffersShape - * @brief Sets the allocation shape for the Buffers after BrgemmCopyB node using BrgemmCopyB parameters - * This pass may be deprecated when a more generic memory management approach is introduced. 
- * Ticket: 113744 - * @ingroup snippets - */ -class SetBrgemmCopyBBuffersShape: public snippets::lowered::pass::RangedPass { -public: - SetBrgemmCopyBBuffersShape() = default; - OPENVINO_RTTI("SetBrgemmCopyBBuffersShape", "Pass"); - bool run(snippets::lowered::LinearIR& linear_ir, - snippets::lowered::LinearIR::constExprIt begin, - snippets::lowered::LinearIR::constExprIt end) override; -}; - -} // namespace pass -} // namespace intel_cpu -} // namespace ov diff --git a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/brgemm_blocking.cpp b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/brgemm_blocking.cpp index 4be2638e28b893..89f2e06c14a9fa 100644 --- a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/brgemm_blocking.cpp +++ b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/brgemm_blocking.cpp @@ -310,7 +310,7 @@ TEST_F(BrgemmCPUBlockingTest, AMX) { { auto data_a = linear_ir->push_node(precision, input_shape_a); auto data_b = linear_ir->push_node(precision, input_shape_b); - auto scratch = linear_ir->push_node(ov::Shape{BrgemmCPU::SCRATCH_BYTE_SIZE}); + auto scratch = linear_ir->push_node(ov::Shape{BrgemmCPU::SCRATCH_BYTE_SIZE}); auto copy_b = linear_ir->push_node(data_b.second, precision, BRGEMM_TYPE::REPACKING_ONLY); init_expr_descriptors(*copy_b.first); auto brgemm = linear_ir->push_node(data_a.second, copy_b.second, scratch.second, BRGEMM_TYPE::WITH_AMX); @@ -324,7 +324,7 @@ TEST_F(BrgemmCPUBlockingTest, AMX) { const auto copy_b_expr = *copy_b.first; init_expr_descriptors(copy_b_expr, {{full_dim, full_dim}, {full_dim, full_dim}}); - auto scratch = linear_ir_ref->push_node(ov::Shape{BrgemmCPU::SCRATCH_BYTE_SIZE}); + auto scratch = linear_ir_ref->push_node(ov::Shape{BrgemmCPU::SCRATCH_BYTE_SIZE}); scratch.first->get()->set_loop_ids({0}); auto brgemm = linear_ir_ref->push_node(data_a.second, copy_b.second, scratch.second, BRGEMM_TYPE::WITH_AMX); diff --git 
a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp index 5434ff228aa833..6dad1d4772f531 100644 --- a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp +++ b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp @@ -17,7 +17,7 @@ #include "transformations/snippets/x64/shape_inference.hpp" #include "transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.hpp" -#include "transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.hpp" +#include "transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.hpp" #include "transformations/snippets/x64/op/brgemm_cpu.hpp" #include "transformations/snippets/x64/op/brgemm_copy_b.hpp" @@ -85,22 +85,20 @@ class BufferAllocationCPUTest : public testing::TestWithParam(m_vector_size); pipeline.register_pass(); pipeline.register_pass(); + pipeline.register_pass(); pipeline.register_pass(); pipeline.register_pass(m_vector_size); pipeline.register_pass(); pipeline.register_pass(); - pipeline.register_pass(); pipeline.register_pass(m_is_buffer_optimized); pipeline.run(m_linear_ir); } void Validate() { std::set reg_groups, clusters; - for (const auto& expr : m_linear_ir) { - if (const auto buffer = ov::as_type_ptr(expr->get_node())) { - reg_groups.insert(buffer->get_reg_group()); - clusters.insert(buffer->get_cluster_id()); - } + for (const auto& buffer : m_linear_ir.get_buffers()) { + reg_groups.insert(buffer->get_reg_group()); + clusters.insert(buffer->get_cluster_id()); } EXPECT_EQ(reg_groups.size(), m_expected_reg_group_count); EXPECT_EQ(clusters.size(), m_expected_cluster_count); @@ -211,7 +209,7 @@ class MHABF16AMXBufferAllocationTest : public BufferAllocationCPUTest { const auto convert1 = std::make_shared(relu0, ov::element::bf16); const auto brgemm_copyb0 = std::make_shared(convert1, 
ov::element::bf16); - const auto scratch0 = std::make_shared(ov::Shape{ov::intel_cpu::BrgemmCPU::SCRATCH_BYTE_SIZE}); + const auto scratch0 = std::make_shared(ov::Shape{ov::intel_cpu::BrgemmCPU::SCRATCH_BYTE_SIZE}); const auto brgemm_cpu0 = std::make_shared( parameter0, brgemm_copyb0->output(0), scratch0, BRGEMM_TYPE::WITH_AMX); @@ -231,7 +229,7 @@ class MHABF16AMXBufferAllocationTest : public BufferAllocationCPUTest { const auto convert2 = std::make_shared(multiply, ov::element::bf16); const auto brgemm_copyb1 = std::make_shared(parameter2, ov::element::bf16); - const auto scratch1 = std::make_shared(ov::Shape{ov::intel_cpu::BrgemmCPU::SCRATCH_BYTE_SIZE}); + const auto scratch1 = std::make_shared(ov::Shape{ov::intel_cpu::BrgemmCPU::SCRATCH_BYTE_SIZE}); const auto brgemm_cpu1 = std::make_shared( convert2, brgemm_copyb1->output(0), scratch1, BRGEMM_TYPE::WITH_AMX);