[ARM CPU] Add ACL FC executor for FP32/FP16 precision (#24123)
allnes authored Aug 13, 2024
1 parent c1e795c commit 8d1cd4e
Showing 16 changed files with 969 additions and 127 deletions.
134 changes: 134 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.cpp
@@ -0,0 +1,134 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "acl_common_executor.hpp"
#include "acl_utils.hpp"
#include "nodes/executors/memory_arguments.hpp"
#include "utils/debug_capabilities.h"

namespace ov {
namespace intel_cpu {

static const std::unordered_map<int, ACLArgs> argConvert = {
    {ARG_SRC_0, ACL_SRC_0},
    {ARG_SRC_1, ACL_SRC_1},
    {ARG_SRC_2, ACL_SRC_2},
    {ARG_BIAS,  ACL_BIAS},
    {ARG_WEI,   ACL_WEI},
    {ARG_DST,   ACL_DST},
};

using ACLTypes = std::array<arm_compute::DataType, ACLArgs::COUNT_OF_ARGS>;
using ACLLayouts = std::array<arm_compute::DataLayout, ACLArgs::COUNT_OF_ARGS>;

static void initACLTensorParams(const MemoryPtr& memoryPtr,
                                const ACLTensorAttrs& attrs,
                                arm_compute::TensorShape& tensorShape,
                                arm_compute::DataType& dataType,
                                arm_compute::DataLayout& dataLayout) {
    dataType = precisionToAclDataType(memoryPtr->getPrecision());
    dataLayout = getAclDataLayoutByMemoryDesc(memoryPtr->getDescPtr());
    if (dataType != arm_compute::DataType::UNKNOWN) {
        auto collapsed_dims = collapse_dims_to_max_rank(memoryPtr->getStaticDims(), attrs.maxDimsShape);
        tensorShape = shapeCast(collapsed_dims);
        if (attrs.hasLayoutTypeNHWC) {
            changeLayoutToNH_C({&tensorShape});
        }
    }
}

static std::shared_ptr<arm_compute::TensorInfo> initTensorInfo(const arm_compute::TensorShape& tensorShape,
                                                               const arm_compute::DataType& dataType,
                                                               const arm_compute::DataLayout& dataLayout) {
    std::shared_ptr<arm_compute::TensorInfo> aclMemoryInfo = nullptr;
    if (dataType != arm_compute::DataType::UNKNOWN) {
        aclMemoryInfo = std::make_shared<arm_compute::TensorInfo>(
                tensorShape, 1,
                dataType,
                dataLayout);
    }
    return aclMemoryInfo;
}

static std::shared_ptr<arm_compute::Tensor> initTensor(const std::shared_ptr<arm_compute::TensorInfo>& aclMemoryInfo) {
    std::shared_ptr<arm_compute::Tensor> aclMemory = nullptr;
    if (aclMemoryInfo) {
        aclMemory = std::make_shared<arm_compute::Tensor>();
        aclMemory->allocator()->init(*aclMemoryInfo);
    }
    return aclMemory;
}

ACLCommonExecutor::ACLCommonExecutor() {
    for (int i = 0; i < ACLArgs::COUNT_OF_ARGS; ++i) {
        aclTensorAttrs.memoryUsageIndicator[i] = false;
    }
}

bool ACLCommonExecutor::update(const MemoryArgs& memory) {
    // Initialize ACL tensor parameters
    ACLShapes aclMemoryShapes;
    ACLTypes aclDataType{};
    ACLLayouts aclDataLayout{};
    for (auto& cpu_mem_ptr : memory) {
        const ACLArgs index = argConvert.at(cpu_mem_ptr.first);
        initACLTensorParams(cpu_mem_ptr.second, aclTensorAttrs,
                            aclMemoryShapes[index],
                            aclDataType[index],
                            aclDataLayout[index]);
    }

    // Update ACL tensor shapes
    updateTensorsShapes(aclMemoryShapes);

    // Initialize arm_compute::TensorInfo objects
    ACLInfos aclMemoryInfos;
    for (int i = 0; i < ACLArgs::COUNT_OF_ARGS; i++) {
        aclMemoryInfos[i] = initTensorInfo(aclMemoryShapes[i], aclDataType[i], aclDataLayout[i]);
    }

    // Validate the arm_compute::TensorInfo objects for the specific ACL function
    auto tensorsInfoValidateStatus = validateTensorsInfo(aclMemoryInfos);
    if (!tensorsInfoValidateStatus) {
        DEBUG_LOG("ACL operator validation failed: ", tensorsInfoValidateStatus.error_description());
        return false;
    }

    // Initialize arm_compute::Tensor objects
    for (int i = 0; i < ACLArgs::COUNT_OF_ARGS; i++) {
        aclMemoryTensors[i] = initTensor(aclMemoryInfos[i]);
        // Indicate that the arm_compute::Tensor object can use the import_memory function
        if (aclMemoryTensors[i]) {
            aclTensorAttrs.memoryUsageIndicator[i] = true;
        }
    }

    // Configure the arm_compute::IFunction object
    configureThreadSafe([&] {
        iFunction = configureFunction(aclMemoryTensors);
    });
    return true;
}

void ACLCommonExecutor::execute(const MemoryArgs& memory) {
    // TODO: Move import_memory() to update() function - CVS-145871
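    // import_memory() wraps the plugin-owned buffer inside the ACL tensor without
    // copying; the matching free() in the destructor releases only ACL's view of
    // that buffer, never the underlying OpenVINO memory itself.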
    for (auto& cpu_mem_ptr : memory) {
        const ACLArgs index = argConvert.at(cpu_mem_ptr.first);
        if (aclTensorAttrs.memoryUsageIndicator[index]) {
            aclMemoryTensors[index]->allocator()->import_memory(memory.at(cpu_mem_ptr.first)->getData());
        }
    }
    iFunction->run();
}

ACLCommonExecutor::~ACLCommonExecutor() {
    for (int i = 0; i < ACLArgs::COUNT_OF_ARGS; i++) {
        if (aclTensorAttrs.memoryUsageIndicator[i]) {
            aclMemoryTensors[i]->allocator()->free();
        }
    }
}

} // namespace intel_cpu
} // namespace ov
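
For orientation, a minimal sketch of how a node-side caller could drive this executor is shown below; the MemoryPtr variables are hypothetical, while the ARG_* keys come from nodes/executors/memory_arguments.hpp:

// Illustrative sketch only, not part of the commit.
MemoryArgs memory;
memory[ARG_SRC_0] = srcMemPtr;  // hypothetical ov::intel_cpu::MemoryPtr inputs
memory[ARG_WEI]   = weiMemPtr;
memory[ARG_DST]   = dstMemPtr;

// update() validates shapes/types and configures the ACL function;
// execute() imports the CPU buffers and runs it.
if (executor->update(memory)) {
    executor->execute(memory);
}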
58 changes: 58 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.hpp
@@ -0,0 +1,58 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "cpu_memory.h"
#include "nodes/executors/executor.hpp"
#include "arm_compute/runtime/NEON/NEFunctions.h"

namespace ov {
namespace intel_cpu {

enum ACLArgs {
    ACL_SRC_0,
    ACL_SRC_1,
    ACL_SRC_2,
    ACL_BIAS,
    ACL_WEI,
    ACL_DST,
    COUNT_OF_ARGS
};

using ACLFunction = std::unique_ptr<arm_compute::IFunction>;
using ACLShapes = std::array<arm_compute::TensorShape, ACLArgs::COUNT_OF_ARGS>;
using ACLInfos = std::array<std::shared_ptr<arm_compute::TensorInfo>, ACLArgs::COUNT_OF_ARGS>;
using ACLTensors = std::array<std::shared_ptr<arm_compute::Tensor>, ACLArgs::COUNT_OF_ARGS>;

struct ACLTensorAttrs {
    bool hasLayoutTypeNHWC = false;
    size_t maxDimsShape = arm_compute::MAX_DIMS;
    std::array<bool, ACLArgs::COUNT_OF_ARGS> memoryUsageIndicator;
};

class ACLCommonExecutor : public Executor {
public:
    ACLCommonExecutor();
    virtual void updateTensorsShapes(ACLShapes& aclMemoryShapes) = 0;
    virtual arm_compute::Status validateTensorsInfo(const ACLInfos& aclMemoryInfos) = 0;
    virtual ACLFunction configureFunction(const ACLTensors& aclMemoryTensors) = 0;
    impl_desc_type implType() const override {
        return impl_desc_type::acl;
    }
    void execute(const MemoryArgs& memory) override;
    bool update(const MemoryArgs& memory) override;
    ~ACLCommonExecutor();

protected:
    ACLTensorAttrs aclTensorAttrs;

private:
    ACLTensors aclMemoryTensors;
    ACLFunction iFunction = nullptr;
};

using ACLCommonExecutorPtr = std::shared_ptr<ACLCommonExecutor>;

} // namespace intel_cpu
} // namespace ov
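
To make the three-hook contract concrete, here is a hypothetical minimal subclass; NECopy stands in for a real ACL kernel, and the FC executor added by this PR follows the same pattern:

// Illustrative sketch only, not part of the commit.
class ACLCopyExecutor : public ACLCommonExecutor {
public:
    void updateTensorsShapes(ACLShapes& aclMemoryShapes) override {
        // A plain copy needs no shape adjustment.
    }
    arm_compute::Status validateTensorsInfo(const ACLInfos& aclMemoryInfos) override {
        return arm_compute::NECopy::validate(aclMemoryInfos[ACL_SRC_0].get(),
                                             aclMemoryInfos[ACL_DST].get());
    }
    ACLFunction configureFunction(const ACLTensors& aclMemoryTensors) override {
        auto copy = std::make_unique<arm_compute::NECopy>();
        copy->configure(aclMemoryTensors[ACL_SRC_0].get(), aclMemoryTensors[ACL_DST].get());
        return copy;
    }
};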
119 changes: 17 additions & 102 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp
@@ -361,66 +361,6 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto
             return acl_op;
         };
         break;
-    case Algorithm::EltwiseRelu:
-        if (aclEltwiseAttrs.alpha == 0) {
-            if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
-                                             ActivationLayerInfo::ActivationFunction::RELU))
-                return false;
-        } else {
-            if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
-                                             {ActivationLayerInfo::ActivationFunction::LEAKY_RELU, aclEltwiseAttrs.alpha}))
-                return false;
-        }
-        exec_func = [this]() -> std::unique_ptr<IFunction> {
-            auto acl_op = std::make_unique<NEActivationLayer>();
-            if (aclEltwiseAttrs.alpha == 0) {
-                acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::RELU);
-            } else {
-                acl_op->configure(&srcTensors[0], &dstTensors[0],
-                                  {ActivationLayerInfo::ActivationFunction::LEAKY_RELU, aclEltwiseAttrs.alpha});
-            }
-            return acl_op;
-        };
-        break;
-    case Algorithm::EltwiseGeluErf:
-        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::GELU))
-            return false;
-        exec_func = [this]() -> std::unique_ptr<IFunction> {
-            auto acl_op = std::make_unique<NEActivationLayer>();
-            acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::GELU);
-            return acl_op;
-        };
-        break;
-    case Algorithm::EltwiseElu:
-        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
-                                         {ActivationLayerInfo::ActivationFunction::ELU, aclEltwiseAttrs.alpha}))
-            return false;
-        exec_func = [this]() -> std::unique_ptr<IFunction> {
-            auto acl_op = std::make_unique<NEActivationLayer>();
-            acl_op->configure(&srcTensors[0], &dstTensors[0], {ActivationLayerInfo::ActivationFunction::ELU, aclEltwiseAttrs.alpha});
-            return acl_op;
-        };
-        break;
-    case Algorithm::EltwiseTanh:
-        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
-                                         {ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f}))
-            return false;
-        exec_func = [this]() -> std::unique_ptr<IFunction> {
-            auto acl_op = std::make_unique<NEActivationLayer>();
-            acl_op->configure(&srcTensors[0], &dstTensors[0],
-                              {ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f});
-            return acl_op;
-        };
-        break;
-    case Algorithm::EltwiseSigmoid:
-        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::LOGISTIC))
-            return false;
-        exec_func = [this]() -> std::unique_ptr<IFunction> {
-            auto acl_op = std::make_unique<NEActivationLayer>();
-            acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::LOGISTIC);
-            return acl_op;
-        };
-        break;
     case Algorithm::EltwiseAbs:
         if (!NEAbsLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0]))
             return false;
@@ -430,24 +370,6 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto
             return acl_op;
         };
         break;
-    case Algorithm::EltwiseSqrt:
-        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::SQRT))
-            return false;
-        exec_func = [this]() -> std::unique_ptr<IFunction> {
-            auto acl_op = std::make_unique<NEActivationLayer>();
-            acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::SQRT);
-            return acl_op;
-        };
-        break;
-    case Algorithm::EltwiseSoftRelu:
-        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::SOFT_RELU))
-            return false;
-        exec_func = [this]() -> std::unique_ptr<IFunction> {
-            auto acl_op = std::make_unique<NEActivationLayer>();
-            acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::SOFT_RELU);
-            return acl_op;
-        };
-        break;
     case Algorithm::EltwiseExp:
         if (!NEExpLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0]))
             return false;
@@ -457,28 +379,6 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto
             return acl_op;
         };
         break;
-    case Algorithm::EltwiseClamp:
-        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
-                                         {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, aclEltwiseAttrs.beta, aclEltwiseAttrs.alpha}))
-            return false;
-        exec_func = [this]() -> std::unique_ptr<IFunction> {
-            auto acl_op = std::make_unique<NEActivationLayer>();
-            acl_op->configure(&srcTensors[0], &dstTensors[0],
-                              {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, aclEltwiseAttrs.beta, aclEltwiseAttrs.alpha});
-            return acl_op;
-        };
-        break;
-    case Algorithm::EltwiseSwish:
-        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
-                                         {ActivationLayerInfo::ActivationFunction::SWISH, aclEltwiseAttrs.alpha}))
-            return false;
-        exec_func = [this]() -> std::unique_ptr<IFunction> {
-            auto acl_op = std::make_unique<NEActivationLayer>();
-            acl_op->configure(&srcTensors[0], &dstTensors[0],
-                              {ActivationLayerInfo::ActivationFunction::SWISH, aclEltwiseAttrs.alpha});
-            return acl_op;
-        };
-        break;
     case Algorithm::EltwisePrelu:
         if (!NEPReluLayer::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0]))
             return false;
@@ -488,12 +388,27 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto
             return acl_op;
         };
         break;
+    case Algorithm::EltwiseRelu:
+    case Algorithm::EltwiseGeluErf:
+    case Algorithm::EltwiseElu:
+    case Algorithm::EltwiseTanh:
+    case Algorithm::EltwiseSigmoid:
+    case Algorithm::EltwiseSqrt:
+    case Algorithm::EltwiseSoftRelu:
+    case Algorithm::EltwiseClamp:
+    case Algorithm::EltwiseSwish:
     case Algorithm::EltwiseHswish:
-        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::HARD_SWISH))
+        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], getActivationLayerInfo(aclEltwiseAttrs.algorithm,
+                                                                                                        aclEltwiseAttrs.alpha,
+                                                                                                        aclEltwiseAttrs.beta,
+                                                                                                        aclEltwiseAttrs.gamma)))
             return false;
         exec_func = [this]() -> std::unique_ptr<IFunction> {
             auto acl_op = std::make_unique<NEActivationLayer>();
-            acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::HARD_SWISH);
+            acl_op->configure(&srcTensors[0], &dstTensors[0], getActivationLayerInfo(aclEltwiseAttrs.algorithm,
+                                                                                     aclEltwiseAttrs.alpha,
+                                                                                     aclEltwiseAttrs.beta,
+                                                                                     aclEltwiseAttrs.gamma));
             return acl_op;
         };
         break;
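
The branches deleted above now funnel through a single getActivationLayerInfo helper from acl_utils, whose body is outside this diff. A hypothetical reconstruction consistent with the removed per-case code would be:

// Sketch only; the real helper lives in acl_utils and may differ in detail.
ActivationLayerInfo getActivationLayerInfo(Algorithm algorithm, float alpha, float beta, float gamma) {
    using AF = ActivationLayerInfo::ActivationFunction;
    (void)gamma;  // unused by the mappings shown in the deleted cases
    switch (algorithm) {
    case Algorithm::EltwiseRelu:
        return alpha == 0 ? ActivationLayerInfo{AF::RELU} : ActivationLayerInfo{AF::LEAKY_RELU, alpha};
    case Algorithm::EltwiseGeluErf:  return {AF::GELU};
    case Algorithm::EltwiseElu:      return {AF::ELU, alpha};
    case Algorithm::EltwiseTanh:     return {AF::TANH, 1.f, 1.f};
    case Algorithm::EltwiseSigmoid:  return {AF::LOGISTIC};
    case Algorithm::EltwiseSqrt:     return {AF::SQRT};
    case Algorithm::EltwiseSoftRelu: return {AF::SOFT_RELU};
    case Algorithm::EltwiseClamp:    return {AF::LU_BOUNDED_RELU, beta, alpha};
    case Algorithm::EltwiseSwish:    return {AF::SWISH, alpha};
    case Algorithm::EltwiseHswish:   return {AF::HARD_SWISH};
    default: OPENVINO_THROW("Unsupported eltwise activation algorithm");
    }
}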
src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.hpp
@@ -16,7 +16,7 @@ class AclEltwiseExecutor : public EltwiseExecutor {
     explicit AclEltwiseExecutor(const ExecutorContext::CPtr context);
     static bool isEltwiseAlgorithmSupported(Algorithm algorithm);
 
-    bool init(const EltwiseAttrs& eltwiseAttrs,
+    bool init(const EltwiseAttrs& attrs,
               const std::vector<MemoryDescPtr>& srcDescs,
               const std::vector<MemoryDescPtr>& dstDescs,
               const std::vector<EltwisePostOp>& postOps) override;
