[ARM CPU] Add ACL FC executor for FP32/FP16 precision (#24123)
allnes authored Aug 13, 2024
1 parent c1e795c commit 8d1cd4e
Showing 16 changed files with 969 additions and 127 deletions.
134 changes: 134 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.cpp
@@ -0,0 +1,134 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "acl_common_executor.hpp"
#include "acl_utils.hpp"
#include "nodes/executors/memory_arguments.hpp"
#include "utils/debug_capabilities.h"

namespace ov {
namespace intel_cpu {

static const std::unordered_map<int, ACLArgs> argConvert = {
    {ARG_SRC_0, ACL_SRC_0},
    {ARG_SRC_1, ACL_SRC_1},
    {ARG_SRC_2, ACL_SRC_2},
    {ARG_BIAS,  ACL_BIAS},
    {ARG_WEI,   ACL_WEI},
    {ARG_DST,   ACL_DST},
};

using ACLTypes = std::array<arm_compute::DataType, ACLArgs::COUNT_OF_ARGS>;
using ACLLayouts = std::array<arm_compute::DataLayout, ACLArgs::COUNT_OF_ARGS>;

static void initACLTensorParams(const MemoryPtr& memoryPtr,
                                const ACLTensorAttrs& attrs,
                                arm_compute::TensorShape& tensorShape,
                                arm_compute::DataType& dataType,
                                arm_compute::DataLayout& dataLayout) {
    dataType = precisionToAclDataType(memoryPtr->getPrecision());
    dataLayout = getAclDataLayoutByMemoryDesc(memoryPtr->getDescPtr());
    if (dataType != arm_compute::DataType::UNKNOWN) {
        auto collapsed_dims = collapse_dims_to_max_rank(memoryPtr->getStaticDims(), attrs.maxDimsShape);
        tensorShape = shapeCast(collapsed_dims);
        if (attrs.hasLayoutTypeNHWC) {
            changeLayoutToNH_C({&tensorShape});
        }
    }
}

static std::shared_ptr<arm_compute::TensorInfo> initTensorInfo(const arm_compute::TensorShape& tensorShape,
                                                               const arm_compute::DataType& dataType,
                                                               const arm_compute::DataLayout& dataLayout) {
    std::shared_ptr<arm_compute::TensorInfo> aclMemoryInfo = nullptr;
    if (dataType != arm_compute::DataType::UNKNOWN) {
        aclMemoryInfo = std::make_shared<arm_compute::TensorInfo>(
                tensorShape, 1,
                dataType,
                dataLayout);
    }
    return aclMemoryInfo;
}

static std::shared_ptr<arm_compute::Tensor> initTensor(const std::shared_ptr<arm_compute::TensorInfo>& aclMemoryInfo) {
    std::shared_ptr<arm_compute::Tensor> aclMemory = nullptr;
    if (aclMemoryInfo) {
        aclMemory = std::make_shared<arm_compute::Tensor>();
        aclMemory->allocator()->init(*aclMemoryInfo);
    }
    return aclMemory;
}

ACLCommonExecutor::ACLCommonExecutor() {
    for (int i = 0; i < ACLArgs::COUNT_OF_ARGS; ++i) {
        aclTensorAttrs.memoryUsageIndicator[i] = false;
    }
}

bool ACLCommonExecutor::update(const MemoryArgs& memory) {
    // Initialize ACL tensor parameters
    ACLShapes aclMemoryShapes;
    ACLTypes aclDataType{};
    ACLLayouts aclDataLayout{};
    for (auto& cpu_mem_ptr : memory) {
        const ACLArgs index = argConvert.at(cpu_mem_ptr.first);
        initACLTensorParams(cpu_mem_ptr.second, aclTensorAttrs,
                            aclMemoryShapes[index],
                            aclDataType[index],
                            aclDataLayout[index]);
    }

    // Update ACL tensor shapes
    updateTensorsShapes(aclMemoryShapes);

    // Initialize arm_compute::TensorInfo objects
    ACLInfos aclMemoryInfos;
    for (int i = 0; i < ACLArgs::COUNT_OF_ARGS; i++) {
        aclMemoryInfos[i] = initTensorInfo(aclMemoryShapes[i], aclDataType[i], aclDataLayout[i]);
    }

    // Validate the arm_compute::TensorInfo objects for the specific ACL function
    auto tensorsInfoValidateStatus = validateTensorsInfo(aclMemoryInfos);
    if (!tensorsInfoValidateStatus) {
        DEBUG_LOG("ACL operator validation failed: ", tensorsInfoValidateStatus.error_description());
        return false;
    }

    // Initialize arm_compute::Tensor objects
    for (int i = 0; i < ACLArgs::COUNT_OF_ARGS; i++) {
        aclMemoryTensors[i] = initTensor(aclMemoryInfos[i]);
        // Indicate that the arm_compute::Tensor object can use the import_memory function
        if (aclMemoryTensors[i]) {
            aclTensorAttrs.memoryUsageIndicator[i] = true;
        }
    }

    // Configure the arm_compute::IFunction object
    configureThreadSafe([&] {
        iFunction = configureFunction(aclMemoryTensors);
    });
    return true;
}

void ACLCommonExecutor::execute(const MemoryArgs& memory) {
    // TODO: Move import_memory() to update() function - CVS-145871
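    // import_memory() wraps the plugin-owned buffer inside the ACL tensor without
    // copying; the matching free() in the destructor releases only ACL's view of
    // that buffer, never the underlying OpenVINO memory itself.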
    for (auto& cpu_mem_ptr : memory) {
        const ACLArgs index = argConvert.at(cpu_mem_ptr.first);
        if (aclTensorAttrs.memoryUsageIndicator[index]) {
            aclMemoryTensors[index]->allocator()->import_memory(memory.at(cpu_mem_ptr.first)->getData());
        }
    }
    iFunction->run();
}

ACLCommonExecutor::~ACLCommonExecutor() {
    for (int i = 0; i < ACLArgs::COUNT_OF_ARGS; i++) {
        if (aclTensorAttrs.memoryUsageIndicator[i]) {
            aclMemoryTensors[i]->allocator()->free();
        }
    }
}

} // namespace intel_cpu
} // namespace ov
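
For orientation, a minimal sketch of how a node-side caller could drive this executor is shown below; the MemoryPtr variables are hypothetical, while the ARG_* keys come from nodes/executors/memory_arguments.hpp:

// Illustrative sketch only, not part of the commit.
MemoryArgs memory;
memory[ARG_SRC_0] = srcMemPtr;  // hypothetical ov::intel_cpu::MemoryPtr inputs
memory[ARG_WEI]   = weiMemPtr;
memory[ARG_DST]   = dstMemPtr;

// update() validates shapes/types and configures the ACL function;
// execute() imports the CPU buffers and runs it.
if (executor->update(memory)) {
    executor->execute(memory);
}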
58 changes: 58 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.hpp
@@ -0,0 +1,58 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "cpu_memory.h"
#include "nodes/executors/executor.hpp"
#include "arm_compute/runtime/NEON/NEFunctions.h"

namespace ov {
namespace intel_cpu {

enum ACLArgs {
    ACL_SRC_0,
    ACL_SRC_1,
    ACL_SRC_2,
    ACL_BIAS,
    ACL_WEI,
    ACL_DST,
    COUNT_OF_ARGS
};

using ACLFunction = std::unique_ptr<arm_compute::IFunction>;
using ACLShapes = std::array<arm_compute::TensorShape, ACLArgs::COUNT_OF_ARGS>;
using ACLInfos = std::array<std::shared_ptr<arm_compute::TensorInfo>, ACLArgs::COUNT_OF_ARGS>;
using ACLTensors = std::array<std::shared_ptr<arm_compute::Tensor>, ACLArgs::COUNT_OF_ARGS>;

struct ACLTensorAttrs {
    bool hasLayoutTypeNHWC = false;
    size_t maxDimsShape = arm_compute::MAX_DIMS;
    std::array<bool, ACLArgs::COUNT_OF_ARGS> memoryUsageIndicator;
};

class ACLCommonExecutor : public Executor {
public:
    ACLCommonExecutor();
    virtual void updateTensorsShapes(ACLShapes& aclMemoryShapes) = 0;
    virtual arm_compute::Status validateTensorsInfo(const ACLInfos& aclMemoryInfos) = 0;
    virtual ACLFunction configureFunction(const ACLTensors& aclMemoryTensors) = 0;
    impl_desc_type implType() const override {
        return impl_desc_type::acl;
    }
    void execute(const MemoryArgs& memory) override;
    bool update(const MemoryArgs& memory) override;
    ~ACLCommonExecutor();

protected:
    ACLTensorAttrs aclTensorAttrs;

private:
    ACLTensors aclMemoryTensors;
    ACLFunction iFunction = nullptr;
};

using ACLCommonExecutorPtr = std::shared_ptr<ACLCommonExecutor>;

} // namespace intel_cpu
} // namespace ov
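
To make the three-hook contract concrete, here is a hypothetical minimal subclass; NECopy stands in for a real ACL kernel, and the FC executor added by this PR follows the same pattern:

// Illustrative sketch only, not part of the commit.
class ACLCopyExecutor : public ACLCommonExecutor {
public:
    void updateTensorsShapes(ACLShapes& aclMemoryShapes) override {
        // A plain copy needs no shape adjustment.
    }
    arm_compute::Status validateTensorsInfo(const ACLInfos& aclMemoryInfos) override {
        return arm_compute::NECopy::validate(aclMemoryInfos[ACL_SRC_0].get(),
                                             aclMemoryInfos[ACL_DST].get());
    }
    ACLFunction configureFunction(const ACLTensors& aclMemoryTensors) override {
        auto copy = std::make_unique<arm_compute::NECopy>();
        copy->configure(aclMemoryTensors[ACL_SRC_0].get(), aclMemoryTensors[ACL_DST].get());
        return copy;
    }
};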
119 changes: 17 additions & 102 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp
@@ -361,66 +361,6 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto
             return acl_op;
         };
         break;
-    case Algorithm::EltwiseRelu:
-        if (aclEltwiseAttrs.alpha == 0) {
-            if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
-                                             ActivationLayerInfo::ActivationFunction::RELU))
-                return false;
-        } else {
-            if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
-                                             {ActivationLayerInfo::ActivationFunction::LEAKY_RELU, aclEltwiseAttrs.alpha}))
-                return false;
-        }
-        exec_func = [this]() -> std::unique_ptr<IFunction> {
-            auto acl_op = std::make_unique<NEActivationLayer>();
-            if (aclEltwiseAttrs.alpha == 0) {
-                acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::RELU);
-            } else {
-                acl_op->configure(&srcTensors[0], &dstTensors[0],
-                                  {ActivationLayerInfo::ActivationFunction::LEAKY_RELU, aclEltwiseAttrs.alpha});
-            }
-            return acl_op;
-        };
-        break;
-    case Algorithm::EltwiseGeluErf:
-        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::GELU))
-            return false;
-        exec_func = [this]() -> std::unique_ptr<IFunction> {
-            auto acl_op = std::make_unique<NEActivationLayer>();
-            acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::GELU);
-            return acl_op;
-        };
-        break;
-    case Algorithm::EltwiseElu:
-        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
-                                         {ActivationLayerInfo::ActivationFunction::ELU, aclEltwiseAttrs.alpha}))
-            return false;
-        exec_func = [this]() -> std::unique_ptr<IFunction> {
-            auto acl_op = std::make_unique<NEActivationLayer>();
-            acl_op->configure(&srcTensors[0], &dstTensors[0], {ActivationLayerInfo::ActivationFunction::ELU, aclEltwiseAttrs.alpha});
-            return acl_op;
-        };
-        break;
-    case Algorithm::EltwiseTanh:
-        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
-                                         {ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f}))
-            return false;
-        exec_func = [this]() -> std::unique_ptr<IFunction> {
-            auto acl_op = std::make_unique<NEActivationLayer>();
-            acl_op->configure(&srcTensors[0], &dstTensors[0],
-                              {ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f});
-            return acl_op;
-        };
-        break;
-    case Algorithm::EltwiseSigmoid:
-        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::LOGISTIC))
-            return false;
-        exec_func = [this]() -> std::unique_ptr<IFunction> {
-            auto acl_op = std::make_unique<NEActivationLayer>();
-            acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::LOGISTIC);
-            return acl_op;
-        };
-        break;
     case Algorithm::EltwiseAbs:
         if (!NEAbsLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0]))
             return false;
@@ -430,24 +370,6 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto
             return acl_op;
         };
         break;
-    case Algorithm::EltwiseSqrt:
-        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::SQRT))
-            return false;
-        exec_func = [this]() -> std::unique_ptr<IFunction> {
-            auto acl_op = std::make_unique<NEActivationLayer>();
-            acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::SQRT);
-            return acl_op;
-        };
-        break;
-    case Algorithm::EltwiseSoftRelu:
-        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::SOFT_RELU))
-            return false;
-        exec_func = [this]() -> std::unique_ptr<IFunction> {
-            auto acl_op = std::make_unique<NEActivationLayer>();
-            acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::SOFT_RELU);
-            return acl_op;
-        };
-        break;
     case Algorithm::EltwiseExp:
         if (!NEExpLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0]))
             return false;
@@ -457,28 +379,6 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto
             return acl_op;
         };
         break;
-    case Algorithm::EltwiseClamp:
-        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
-                                         {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, aclEltwiseAttrs.beta, aclEltwiseAttrs.alpha}))
-            return false;
-        exec_func = [this]() -> std::unique_ptr<IFunction> {
-            auto acl_op = std::make_unique<NEActivationLayer>();
-            acl_op->configure(&srcTensors[0], &dstTensors[0],
-                              {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, aclEltwiseAttrs.beta, aclEltwiseAttrs.alpha});
-            return acl_op;
-        };
-        break;
-    case Algorithm::EltwiseSwish:
-        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
-                                         {ActivationLayerInfo::ActivationFunction::SWISH, aclEltwiseAttrs.alpha}))
-            return false;
-        exec_func = [this]() -> std::unique_ptr<IFunction> {
-            auto acl_op = std::make_unique<NEActivationLayer>();
-            acl_op->configure(&srcTensors[0], &dstTensors[0],
-                              {ActivationLayerInfo::ActivationFunction::SWISH, aclEltwiseAttrs.alpha});
-            return acl_op;
-        };
-        break;
     case Algorithm::EltwisePrelu:
         if (!NEPReluLayer::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0]))
             return false;
@@ -488,12 +388,27 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto
             return acl_op;
         };
         break;
+    case Algorithm::EltwiseRelu:
+    case Algorithm::EltwiseGeluErf:
+    case Algorithm::EltwiseElu:
+    case Algorithm::EltwiseTanh:
+    case Algorithm::EltwiseSigmoid:
+    case Algorithm::EltwiseSqrt:
+    case Algorithm::EltwiseSoftRelu:
+    case Algorithm::EltwiseClamp:
+    case Algorithm::EltwiseSwish:
     case Algorithm::EltwiseHswish:
-        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::HARD_SWISH))
+        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], getActivationLayerInfo(aclEltwiseAttrs.algorithm,
+                                                                                                        aclEltwiseAttrs.alpha,
+                                                                                                        aclEltwiseAttrs.beta,
+                                                                                                        aclEltwiseAttrs.gamma)))
             return false;
         exec_func = [this]() -> std::unique_ptr<IFunction> {
             auto acl_op = std::make_unique<NEActivationLayer>();
-            acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::HARD_SWISH);
+            acl_op->configure(&srcTensors[0], &dstTensors[0], getActivationLayerInfo(aclEltwiseAttrs.algorithm,
+                                                                                     aclEltwiseAttrs.alpha,
+                                                                                     aclEltwiseAttrs.beta,
+                                                                                     aclEltwiseAttrs.gamma));
             return acl_op;
         };
         break;
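
The branches deleted above now funnel through a single getActivationLayerInfo helper from acl_utils, whose body is outside this diff. A hypothetical reconstruction consistent with the removed per-case code would be:

// Sketch only; the real helper lives in acl_utils and may differ in detail.
ActivationLayerInfo getActivationLayerInfo(Algorithm algorithm, float alpha, float beta, float gamma) {
    using AF = ActivationLayerInfo::ActivationFunction;
    (void)gamma;  // unused by the mappings shown in the deleted cases
    switch (algorithm) {
    case Algorithm::EltwiseRelu:
        return alpha == 0 ? ActivationLayerInfo{AF::RELU} : ActivationLayerInfo{AF::LEAKY_RELU, alpha};
    case Algorithm::EltwiseGeluErf:  return {AF::GELU};
    case Algorithm::EltwiseElu:      return {AF::ELU, alpha};
    case Algorithm::EltwiseTanh:     return {AF::TANH, 1.f, 1.f};
    case Algorithm::EltwiseSigmoid:  return {AF::LOGISTIC};
    case Algorithm::EltwiseSqrt:     return {AF::SQRT};
    case Algorithm::EltwiseSoftRelu: return {AF::SOFT_RELU};
    case Algorithm::EltwiseClamp:    return {AF::LU_BOUNDED_RELU, beta, alpha};
    case Algorithm::EltwiseSwish:    return {AF::SWISH, alpha};
    case Algorithm::EltwiseHswish:   return {AF::HARD_SWISH};
    default: OPENVINO_THROW("Unsupported eltwise activation algorithm");
    }
}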
src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.hpp
@@ -16,7 +16,7 @@ class AclEltwiseExecutor : public EltwiseExecutor {
     explicit AclEltwiseExecutor(const ExecutorContext::CPtr context);
     static bool isEltwiseAlgorithmSupported(Algorithm algorithm);
 
-    bool init(const EltwiseAttrs& eltwiseAttrs,
+    bool init(const EltwiseAttrs& attrs,
               const std::vector<MemoryDescPtr>& srcDescs,
               const std::vector<MemoryDescPtr>& dstDescs,
               const std::vector<EltwisePostOp>& postOps) override;
