From 7072212482d639dce245d0305675607d581729b6 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Thu, 31 Oct 2024 11:35:22 +0000 Subject: [PATCH] 2024-10-31 nightly release (f813b6a9ab3edb638957f0a2e054da804e087980) --- .github/workflows/_android.yml | 10 + .github/workflows/pull.yml | 7 +- .../coreml/compiler/coreml_preprocess.py | 5 +- .../runtime/delegate/backend_delegate.h | 2 +- .../com.apple.executorchcoreml_config.plist | 2 +- .../test/CoreMLBackendDelegateTests.mm | 6 + .../project.pbxproj | 4 + .../coreml/scripts/generate_test_models.sh | 9 + backends/arm/_passes/arm_pass_manager.py | 2 + backends/arm/_passes/arm_pass_utils.py | 47 ++ backends/arm/_passes/conv1d_unsqueeze_pass.py | 164 +++++ .../arm/_passes/convert_split_to_slice.py | 1 + backends/arm/_passes/tag_io_quant_pass.py | 6 +- backends/arm/test/ops/test_conv1d.py | 298 ++++++++ .../test/ops/{test_conv.py => test_conv2d.py} | 6 +- backends/arm/test/ops/test_depthwise_conv.py | 109 ++- .../arm/test/passes/test_tag_io_quant_pass.py | 18 +- backends/arm/test/runner_utils.py | 2 +- backends/cadence/aot/functions_hifi.yaml | 17 +- backends/cadence/cadence.cmake | 3 + backends/cadence/hifi/kernels/CMakeLists.txt | 4 + backends/cadence/hifi/kernels/kernels.h | 43 ++ .../cadence/hifi/operators/CMakeLists.txt | 23 +- backends/cadence/hifi/operators/op_add.cpp | 206 ++++++ backends/cadence/hifi/operators/op_div.cpp | 288 ++++++++ backends/cadence/hifi/operators/op_mul.cpp | 169 +++++ .../cadence/hifi/operators/op_sigmoid.cpp | 82 +++ backends/cadence/hifi/operators/op_sub.cpp | 203 ++++++ backends/cadence/hifi/operators/op_tanh.cpp | 44 ++ .../nnlib/xa_nn_elm_add_f32_broadcast.c | 428 ++++++++++++ .../nnlib/xa_nn_elm_div_f32_broadcast.c | 419 ++++++++++++ .../nnlib/xa_nn_elm_div_mode_f32_broadcast.c | 644 ++++++++++++++++++ .../nnlib/xa_nn_elm_mul_f32_broadcast.c | 360 ++++++++++ .../vulkan/runtime/api/containers/Tensor.cpp | 15 - .../vulkan/runtime/api/containers/Tensor.h | 20 +- backends/vulkan/runtime/graph/ComputeGraph.h | 8 +- .../graph/ops/glsl/addmm_naive_texture3d.glsl | 28 +- .../graph/ops/glsl/addmm_optimized.glsl | 22 +- .../runtime/graph/ops/glsl/binary_op.glsl | 19 +- .../bitw8_image_to_nchw_nobitw8buffer.glsl | 5 +- .../vulkan/runtime/graph/ops/glsl/conv1d.glsl | 21 +- .../graph/ops/glsl/copy_channel_offset.glsl | 24 +- .../runtime/graph/ops/glsl/copy_offset.glsl | 12 +- .../runtime/graph/ops/glsl/embedding.glsl | 17 +- .../runtime/graph/ops/glsl/image_to_nchw.glsl | 9 +- .../runtime/graph/ops/glsl/indexing_utils.h | 20 +- .../graph/ops/glsl/native_layer_norm.glsl | 12 +- .../nchw_to_bitw8_image_nobitw8buffer.glsl | 5 +- .../graph/ops/glsl/nchw_to_buffer.glsl | 2 +- .../runtime/graph/ops/glsl/nchw_to_image.glsl | 19 +- .../graph/ops/glsl/repeat_interleave.glsl | 14 +- .../runtime/graph/ops/impl/BinaryOp.cpp | 5 +- .../runtime/graph/ops/impl/Convolution.cpp | 13 +- .../vulkan/runtime/graph/ops/impl/Copy.cpp | 12 +- .../runtime/graph/ops/impl/Embedding.cpp | 11 +- .../vulkan/runtime/graph/ops/impl/Linear.cpp | 21 +- .../vulkan/runtime/graph/ops/impl/MatMul.cpp | 16 +- .../graph/ops/impl/NativeLayerNorm.cpp | 6 +- .../graph/ops/impl/RepeatInterleave.cpp | 9 +- .../vulkan/runtime/graph/ops/impl/Staging.cpp | 18 +- .../graph/ops/impl/utils/TensorUtils.h | 14 - backends/vulkan/test/utils/test_utils.cpp | 20 +- .../vulkan/test/vulkan_compute_api_test.cpp | 12 +- build/run_android_emulator.sh | 9 +- .../executorch-arm-delegate-tutorial.md | 194 +++--- examples/apple/coreml/scripts/export.py | 3 +- 
.../oss_scripts/llama2/runner/runner.cpp | 2 +- exir/pass_base.py | 2 +- runtime/executor/method.cpp | 69 +- runtime/executor/tensor_parser.h | 17 +- runtime/executor/tensor_parser_exec_aten.cpp | 15 +- runtime/platform/assert.h | 12 +- runtime/platform/default/posix.cpp | 4 +- runtime/platform/log.h | 5 +- runtime/platform/profiler.h | 29 +- 75 files changed, 3987 insertions(+), 434 deletions(-) create mode 100644 backends/arm/_passes/conv1d_unsqueeze_pass.py create mode 100644 backends/arm/test/ops/test_conv1d.py rename backends/arm/test/ops/{test_conv.py => test_conv2d.py} (98%) create mode 100644 backends/cadence/hifi/operators/op_add.cpp create mode 100644 backends/cadence/hifi/operators/op_div.cpp create mode 100644 backends/cadence/hifi/operators/op_mul.cpp create mode 100644 backends/cadence/hifi/operators/op_sigmoid.cpp create mode 100644 backends/cadence/hifi/operators/op_sub.cpp create mode 100644 backends/cadence/hifi/operators/op_tanh.cpp create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c diff --git a/.github/workflows/_android.yml b/.github/workflows/_android.yml index f4f04e4eef..922762210c 100644 --- a/.github/workflows/_android.yml +++ b/.github/workflows/_android.yml @@ -66,6 +66,16 @@ jobs: # avoid permission issue sudo chown -R "${USER}" /opt/android + - name: Download Artifacts + shell: bash + run: | + set -eux + curl -O https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/llm_demo/app-debug.apk + curl -O https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/llm_demo/app-debug-androidTest.apk + curl -O https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/fp32-xnnpack-custom/model.zip + unzip model.zip + mv *.pte model.pte + - name: Gradle cache uses: gradle/actions/setup-gradle@v3 diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 4ba24f635e..dd5b432e5a 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -99,6 +99,8 @@ jobs: submodules: 'true' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 900 + upload-artifact: android-models + upload-artifact-to-s3: true script: | # The generic Linux job chooses to use base env, not the one setup by the image CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") @@ -107,13 +109,15 @@ jobs: DTYPE=${{ matrix.dtype }} BUILD_TOOL="cmake" MODE=${{ matrix.mode }} + ARTIFACTS_DIR_NAME="artifacts-to-be-uploaded/${DTYPE}-${MODE}" + ARTIFACTS_DIR_NAME="${ARTIFACTS_DIR_NAME/+/-}" # Setup executorch PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" # Install requirements for export_llama PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh # Test llama2 - PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh stories110M "${BUILD_TOOL}" "${DTYPE}" "${MODE}" + PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh stories110M "${BUILD_TOOL}" "${DTYPE}" "${MODE}" "${ARTIFACTS_DIR_NAME}" test-llama-runner-linux-android: name: test-llama-runner-linux-android @@ -320,6 +324,7 @@ jobs: android: uses: ./.github/workflows/_android.yml + needs: test-llama-runner-linux 
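The new ARTIFACTS_DIR_NAME step above builds the upload path from the dtype/mode matrix values and swaps the first '+' for '-' via bash's ${ARTIFACTS_DIR_NAME/+/-}, consistent with the fp32-xnnpack-custom path fetched in _android.yml. A rough Python equivalent of that substitution (the dtype/mode values are assumed examples, not taken from the workflow matrix):

# Illustrative sketch: mirrors bash's ${ARTIFACTS_DIR_NAME/+/-}, which replaces
# only the first '+'. The dtype/mode values below are assumptions.
dtype, mode = "fp32", "xnnpack+custom"
artifacts_dir_name = f"artifacts-to-be-uploaded/{dtype}-{mode}".replace("+", "-", 1)
print(artifacts_dir_name)  # artifacts-to-be-uploaded/fp32-xnnpack-custom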
unittest: uses: ./.github/workflows/_unittest.yml diff --git a/backends/apple/coreml/compiler/coreml_preprocess.py b/backends/apple/coreml/compiler/coreml_preprocess.py index 5084405c46..c7828888ee 100644 --- a/backends/apple/coreml/compiler/coreml_preprocess.py +++ b/backends/apple/coreml/compiler/coreml_preprocess.py @@ -425,12 +425,15 @@ def preprocess( CoreMLBackend.op_linear_quantizer_config_from_compile_specs(compile_specs) ) + # Load the model if MODEL_TYPE is 'COMPILED_MODEL'. This step is necessary because + # get_compiled_model_path() requires a loaded model. + skip_model_load = model_type != CoreMLBackend.MODEL_TYPE.COMPILED_MODEL mlmodel = ct.convert( model=edge_program, source="pytorch", convert_to="mlprogram", pass_pipeline=ct.PassPipeline.DEFAULT, - skip_model_load=True, + skip_model_load=skip_model_load, compute_precision=model_compute_precision, minimum_deployment_target=minimum_deployment_target, compute_units=compute_units, diff --git a/backends/apple/coreml/runtime/delegate/backend_delegate.h b/backends/apple/coreml/runtime/delegate/backend_delegate.h index ed921fb35b..a6e012a448 100644 --- a/backends/apple/coreml/runtime/delegate/backend_delegate.h +++ b/backends/apple/coreml/runtime/delegate/backend_delegate.h @@ -28,7 +28,7 @@ class BackendDelegate { // Max models cache size in bytes. size_t max_models_cache_size = 10 * size_t(1024) * size_t(1024) * size_t(1024); // If set to `true`, delegate pre-warms the most recently used asset. - bool should_prewarm_asset = true; + bool should_prewarm_asset = false; // If set to `true`, delegate pre-warms the model in `init`. bool should_prewarm_model = true; }; diff --git a/backends/apple/coreml/runtime/delegate/com.apple.executorchcoreml_config.plist b/backends/apple/coreml/runtime/delegate/com.apple.executorchcoreml_config.plist index df37a47755..899bf12bbe 100644 --- a/backends/apple/coreml/runtime/delegate/com.apple.executorchcoreml_config.plist +++ b/backends/apple/coreml/runtime/delegate/com.apple.executorchcoreml_config.plist @@ -3,7 +3,7 @@ shouldPrewarmAsset - + shouldPrewarmModel maxAssetsSizeInBytes diff --git a/backends/apple/coreml/runtime/test/CoreMLBackendDelegateTests.mm b/backends/apple/coreml/runtime/test/CoreMLBackendDelegateTests.mm index ef114546fe..661f91aa70 100644 --- a/backends/apple/coreml/runtime/test/CoreMLBackendDelegateTests.mm +++ b/backends/apple/coreml/runtime/test/CoreMLBackendDelegateTests.mm @@ -209,6 +209,12 @@ - (void)testStateProgramExecute { } #endif +- (void)testAddMulCompiledProgramExecute { + NSURL *modelURL = [[self class] bundledResourceWithName:@"add_mul_compiled_coreml_all" extension:@"pte"]; + XCTAssertNotNil(modelURL); + [self executeModelAtURL:modelURL nLoads:1 nExecutions:2]; +} + - (void)executeMultipleModelsConcurrently:(NSArray *)modelURLs nLoads:(NSUInteger)nLoads nExecutions:(NSUInteger)nExecutions diff --git a/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj b/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj index 1cb29d7c96..6ff30636a3 100644 --- a/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj +++ b/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj @@ -8,6 +8,7 @@ /* Begin PBXBuildFile section */ 8307EB8A2C9262060011AE6D /* state_coreml_all.pte in Resources */ = {isa = PBXBuildFile; fileRef = 8307EB892C9262060011AE6D /* state_coreml_all.pte */; }; + 838CA6872CD1965700462190 /* add_mul_compiled_coreml_all.pte in Resources */ = {isa = 
PBXBuildFile; fileRef = 838CA6862CD1965700462190 /* add_mul_compiled_coreml_all.pte */; }; 83BB78A02C65DA7300274ED7 /* ETCoreMLModelDebugInfo.mm in Sources */ = {isa = PBXBuildFile; fileRef = 83BB789F2C65DA7300274ED7 /* ETCoreMLModelDebugInfo.mm */; }; 83BB78BF2C66AAAE00274ED7 /* add_mul_coreml_all.bin in Resources */ = {isa = PBXBuildFile; fileRef = 83BB78BD2C66AAAE00274ED7 /* add_mul_coreml_all.bin */; }; 83BB78C02C66AAAE00274ED7 /* add_mul_coreml_all.pte in Resources */ = {isa = PBXBuildFile; fileRef = 83BB78BE2C66AAAE00274ED7 /* add_mul_coreml_all.pte */; }; @@ -122,6 +123,7 @@ /* Begin PBXFileReference section */ 8307EB892C9262060011AE6D /* state_coreml_all.pte */ = {isa = PBXFileReference; lastKnownFileType = file; name = state_coreml_all.pte; path = ../test/models/state_coreml_all.pte; sourceTree = ""; }; + 838CA6862CD1965700462190 /* add_mul_compiled_coreml_all.pte */ = {isa = PBXFileReference; lastKnownFileType = file; name = add_mul_compiled_coreml_all.pte; path = ../test/models/add_mul_compiled_coreml_all.pte; sourceTree = ""; }; 83BB789E2C65DA7300274ED7 /* ETCoreMLModelDebugInfo.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = ETCoreMLModelDebugInfo.h; path = ../sdk/ETCoreMLModelDebugInfo.h; sourceTree = ""; }; 83BB789F2C65DA7300274ED7 /* ETCoreMLModelDebugInfo.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; name = ETCoreMLModelDebugInfo.mm; path = ../sdk/ETCoreMLModelDebugInfo.mm; sourceTree = ""; }; 83BB78BD2C66AAAE00274ED7 /* add_mul_coreml_all.bin */ = {isa = PBXFileReference; lastKnownFileType = archive.macbinary; name = add_mul_coreml_all.bin; path = ../test/models/add_mul_coreml_all.bin; sourceTree = ""; }; @@ -606,6 +608,7 @@ C98551992AD2542D009143F9 /* mul_coreml_all.bin */, C985519C2AD2542D009143F9 /* mul_coreml_all.pte */, C985519B2AD2542D009143F9 /* mv3_coreml_all.bin */, + 838CA6862CD1965700462190 /* add_mul_compiled_coreml_all.pte */, C98551982AD2542D009143F9 /* mv3_coreml_all.pte */, 83BB78BD2C66AAAE00274ED7 /* add_mul_coreml_all.bin */, 83BB78BE2C66AAAE00274ED7 /* add_mul_coreml_all.pte */, @@ -680,6 +683,7 @@ C985519E2AD2542D009143F9 /* mv3_coreml_all.pte in Resources */, C98551A02AD2542D009143F9 /* add_coreml_all.bin in Resources */, C98551A22AD2542D009143F9 /* mul_coreml_all.pte in Resources */, + 838CA6872CD1965700462190 /* add_mul_compiled_coreml_all.pte in Resources */, 8307EB8A2C9262060011AE6D /* state_coreml_all.pte in Resources */, C98551A32AD2542D009143F9 /* add_coreml_all.pte in Resources */, ); diff --git a/backends/apple/coreml/scripts/generate_test_models.sh b/backends/apple/coreml/scripts/generate_test_models.sh index 0c1822aa82..001ba36239 100755 --- a/backends/apple/coreml/scripts/generate_test_models.sh +++ b/backends/apple/coreml/scripts/generate_test_models.sh @@ -31,3 +31,12 @@ done echo "Executorch: Generating stateful model" python3 "$SCRIPT_DIR_PATH/../runtime/test/export_stateful_model.py" + +COMPILE_MODELS=("add_mul") +echo "Executorch: Generating compiled model" +for MODEL in "${COMPILE_MODELS[@]}" +do + echo "Executorch: Generating compiled $MODEL model" + python3 -m examples.apple.coreml.scripts.export --model_name "$MODEL" --compile + mv -f "$MODEL""_compiled_coreml_all.pte" "$COREML_DIR_PATH/runtime/test/models" +done diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py index 3a0cea4c80..b3ddecbc29 100644 --- a/backends/arm/_passes/arm_pass_manager.py +++ b/backends/arm/_passes/arm_pass_manager.py @@ -12,6 +12,7 @@ 
AnnotateChannelsLastDimOrder, ) from executorch.backends.arm._passes.cast_int64_pass import CastInt64ToInt32Pass +from executorch.backends.arm._passes.conv1d_unsqueeze_pass import Conv1dUnsqueezePass from executorch.backends.arm._passes.convert_expand_copy_to_repeat import ( ConvertExpandCopyToRepeatPass, ) @@ -69,6 +70,7 @@ def transform_to_backend_pipeline( self.add_pass(DecomposeDivPass()) self.add_pass(InsertSqueezeAfterSumPass()) self.add_pass(ConvertSplitToSlicePass()) + self.add_pass(Conv1dUnsqueezePass(exported_program)) self.add_pass(DecomposeSoftmaxesPass()) for spec in compile_spec: if spec.key == "permute_memory_format": diff --git a/backends/arm/_passes/arm_pass_utils.py b/backends/arm/_passes/arm_pass_utils.py index 0e74701ab6..280864cbc9 100644 --- a/backends/arm/_passes/arm_pass_utils.py +++ b/backends/arm/_passes/arm_pass_utils.py @@ -1,3 +1,4 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. # Copyright 2024 Arm Limited and/or its affiliates. # All rights reserved. # @@ -9,11 +10,57 @@ import torch import torch.fx +from executorch.exir import ExportedProgram from executorch.exir.dialects._ops import ops as exir_ops + +from torch._export.utils import ( + get_buffer, + get_lifted_tensor_constant, + get_param, + is_buffer, + is_lifted_tensor_constant, + is_param, +) from torch._ops import OpOverload from torch._subclasses.fake_tensor import FakeTensor +def is_get_attr_node(node: torch.fx.Node) -> bool: + """ + Returns true if the given node is a get attr node for a tensor of the model + """ + return isinstance(node, torch.fx.Node) and node.op == "get_attr" + + +def is_param_node(exp_prog: ExportedProgram, node: torch.fx.Node) -> bool: + return ( + is_get_attr_node(node) + or is_param(exp_prog, node) + or is_buffer(exp_prog, node) + or is_lifted_tensor_constant(exp_prog, node) + ) + + +def get_param_tensor( + exp_prog: ExportedProgram, node: torch.fx.Node +) -> Optional[torch.Tensor]: + if node is None: + return None + elif is_param(exp_prog, node): + return get_param(exp_prog, node) + elif is_buffer(exp_prog, node): + return get_buffer(exp_prog, node) + elif is_lifted_tensor_constant(exp_prog, node): + return get_lifted_tensor_constant(exp_prog, node) + elif is_get_attr_node(node): + # This is a hack to support both lifted and unlifted graph + try: + return getattr(node.graph.owning_module, node.target) + except AttributeError: + return getattr(exp_prog.graph_module, node.target) + raise RuntimeError(f"unsupported param type, {node.op}.") + + def create_node( graph: torch.fx.Graph, op_target: OpOverload, diff --git a/backends/arm/_passes/conv1d_unsqueeze_pass.py b/backends/arm/_passes/conv1d_unsqueeze_pass.py new file mode 100644 index 0000000000..7fe5c6f7b6 --- /dev/null +++ b/backends/arm/_passes/conv1d_unsqueeze_pass.py @@ -0,0 +1,164 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
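The Conv1dUnsqueezePass added below turns each 1-D aten.convolution into a 2-D one, since TOSA only supports 2-D and 3-D convolution: the input and kernel gain a trailing unit dimension, stride/padding/dilation are extended accordingly, and the result is squeezed back to 3-D. A minimal eager-mode sketch of the equivalent rewrite (the helper and tensor names are illustrative, not part of the pass):

import torch
import torch.nn.functional as F

def conv1d_via_conv2d(x, weight, bias, stride, padding, dilation, groups):
    # x: (N, C, L), weight: (C_out, C_in/groups, K) -- the original 1-D layout
    x4d = x.unsqueeze(-1)        # (N, C, L, 1)
    w4d = weight.unsqueeze(-1)   # (C_out, C_in/groups, K, 1)
    y4d = F.conv2d(
        x4d, w4d, bias,
        stride=(stride, 1),      # stride + [1]
        padding=(padding, 0),    # padding + [0]
        dilation=(dilation, 1),  # dilation + [1]
        groups=groups,
    )
    return y4d.squeeze(-1)       # back to (N, C_out, L_out)

x = torch.randn(1, 3, 16)
w = torch.randn(8, 3, 3)
assert torch.allclose(
    conv1d_via_conv2d(x, w, None, 1, 1, 1, 1),
    F.conv1d(x, w, None, stride=1, padding=1, dilation=1, groups=1),
    atol=1e-5,
)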
+ + +import torch +from executorch.backends.arm._passes.arm_pass_utils import ( + create_node, + get_param_tensor, + insert_q_dq_pair, + is_param_node, +) +from executorch.backends.arm.tosa_quant_utils import dq_op, q_op +from executorch.exir import ExportedProgram +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult + + +class Conv1dUnsqueezePass(ExportPass): + """ + This pass is used to change conv1d ops into conv2d since TOSA only + supports 2d and 3d convolution. This is done by modifying the graph to do the + following: + 1) unsqueeze the convolution's input from 3d to 4d + 2) if the input to unsqueeze is quantized, insert q/dq-pair after unsqueeze + 3) perform a conv2d (with a modified version of the original conv1d args) + 4) squeeze the output back down to 3d. + 5) if all users of squeeze are quantized, insert q/dq-pair before squeeze + """ + + def __init__(self, exported_program: ExportedProgram) -> None: + super().__init__() + self.exported_program = exported_program + + def unsqueeze_kernel_weights(self, kernel_node): + """ + Unsqueezes the weights of a conv1d to make it 4 dimensional. + + Args: + kernel_node: the weights of conv1d node to be unsqueezed + """ + kernel_param_3d = get_param_tensor(self.exported_program, kernel_node) + if kernel_param_3d is None: + raise AssertionError("Expected param tensor for the kernel node") + + kernel_param_4d = torch.nn.Parameter( + data=kernel_param_3d.data.contiguous().unsqueeze(dim=-1), + requires_grad=False, + ) + + if torch._export.utils.is_param(self.exported_program, kernel_node): + parameter_name = self.exported_program.graph_signature.inputs_to_parameters[ + kernel_node.name + ] + self.exported_program.state_dict[parameter_name] = kernel_param_4d + kernel_node.meta["val"] = kernel_node.meta["val"].data.unsqueeze(dim=-1) + elif torch._export.utils.is_buffer(self.exported_program, kernel_node): + buffer_name = self.exported_program.graph_signature.inputs_to_buffers[ + kernel_node.name + ] + self.exported_program.state_dict[buffer_name] = kernel_param_4d + kernel_node.meta["val"] = kernel_node.meta["val"].data.unsqueeze(dim=-1) + elif torch._export.utils.is_lifted_tensor_constant( + self.exported_program, kernel_node + ): + buffer_name = ( + self.exported_program.graph_signature.inputs_to_lifted_tensor_constants[ + kernel_node.name + ] + ) + self.exported_program.constants[buffer_name] = kernel_param_4d + kernel_node.meta["val"] = kernel_node.meta["val"].data.unsqueeze(dim=-1) + else: + setattr( + kernel_node.graph.owning_module, + kernel_node.target, + kernel_param_4d, + ) + + def call(self, graph_module: torch.fx.GraphModule): + graph = graph_module.graph + node_list = list(graph.nodes) + for node in node_list: + if node.op == "call_function": + if node.target == exir_ops.edge.aten.convolution.default: + stride = list(node.args[3]) + if len(stride) != 1: + # skip conv if it is not 1d + continue + + kernel_node = node.args[1] + if kernel_node.target == dq_op: + kernel_node = kernel_node.args[0] + + if not is_param_node(self.exported_program, kernel_node): + raise AssertionError( + "Expected op for convolution weight node to be a get_attr node or a parameter" + ) + + # Modify graph such that the conv changes from 1d to 2d + self.unsqueeze_kernel_weights(kernel_node) + + # (b) Extend stride, padding, and dilation for extra dim + node.args = ( + node.args[0], + node.args[1], + node.args[2], + node.args[3] + [1], # stride + node.args[4] + [0], # padding + node.args[5] + 
[1], # dilation + node.args[6], + node.args[7] + [0], + node.args[8], + ) + + # c. Add unsqueeze to input (3d -> 4d) and squeeze to output (4d -> 3d) + # unsqueeze -> conv2d -> squeeze + with graph.inserting_before(node): + input_node = node.args[0] + unsqueeze_before = create_node( + graph, exir_ops.edge.aten.unsqueeze_copy.default + ) + unsqueeze_before.args = ( + input_node, # Input is node's original input + -1, # Last Dimension + ) + node.replace_input_with(input_node, unsqueeze_before) + + # If Quantized we must insert unsqueeze --> q --> dq --> node + if input_node.target == dq_op: + q_params = input_node.args[1:] + insert_q_dq_pair(graph, unsqueeze_before, q_params) + + with graph.inserting_after(node): + squeeze_after = create_node( + graph, + exir_ops.edge.aten.squeeze_copy.dims, + ) + squeeze_after.args = ( + node, # Input is the conv node + [-1], # Last dimension + ) + original_users = [ + user for user in node.users if user != squeeze_after + ] + for user in original_users: + user.replace_input_with(node, squeeze_after) + + # If quantized, insert conv2d --> q --> dq --> squeeze + if all( + original_user.target == q_op for original_user in original_users + ): + q_params = original_users[0].args[1:] + insert_q_dq_pair(graph, node, q_params) + + graph_module.recompile() + # Since we are overriding "call", we need to call the parent's "call" + # to retrace the graph and regenerate metadata + graph_module = super().call(graph_module).graph_module + + return PassResult(graph_module, True) diff --git a/backends/arm/_passes/convert_split_to_slice.py b/backends/arm/_passes/convert_split_to_slice.py index 787cebec9d..ed2dcd4008 100644 --- a/backends/arm/_passes/convert_split_to_slice.py +++ b/backends/arm/_passes/convert_split_to_slice.py @@ -70,4 +70,5 @@ def call(self, graph_module: torch.fx.GraphModule): output_node.replace_all_uses_with(slice_node) graph.eliminate_dead_code() graph_module.recompile() + graph_module = super().call(graph_module).graph_module return PassResult(graph_module, True) diff --git a/backends/arm/_passes/tag_io_quant_pass.py b/backends/arm/_passes/tag_io_quant_pass.py index 2fce6cf3fd..49990d8e5f 100644 --- a/backends/arm/_passes/tag_io_quant_pass.py +++ b/backends/arm/_passes/tag_io_quant_pass.py @@ -43,9 +43,9 @@ def call(self, graph_module: torch.fx.GraphModule): # tag dq of outputs if node.op == "output": - quant, *_ = node.args[0] - if self.is_dequant_node(quant): - quant.meta["arm_override_partition"] = False + for quant in node.args[0]: + if self.is_dequant_node(quant): + quant.meta["arm_override_partition"] = False graph_module.recompile() return PassResult(graph_module, True) diff --git a/backends/arm/test/ops/test_conv1d.py b/backends/arm/test/ops/test_conv1d.py new file mode 100644 index 0000000000..3b27554221 --- /dev/null +++ b/backends/arm/test/ops/test_conv1d.py @@ -0,0 +1,298 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +from typing import List, Optional, Tuple, Union + +import torch +from executorch.backends.arm.test import common + +from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.backend_details import CompileSpec +from parameterized import parameterized + + +class Conv1d(torch.nn.Module): + """ + Creates one or many chained 1D-convolutions. 
For multiple convolutions, the + respective parameteres are provided as lists. + """ + + def __init__( + self, + inputs: Optional[torch.Tensor] = None, + length=8, + nbr_conv=1, # Number of chained convs + in_channels: Union[List, int, None] = None, + out_channels: Union[List, int, None] = None, + kernel_size: Union[List, Tuple, None] = None, + stride: Union[List, Tuple, None] = None, + padding: Union[List, Tuple, None] = None, + dilation: Union[List, Tuple, None] = None, + groups: Union[List, int, None] = None, + bias: Union[List, bool, None] = None, + padding_mode: Union[List, str, None] = None, + batches=1, + dtype=torch.float32, + ): + super().__init__() + self.nbr_convs = nbr_conv + + # Handle default values + in_channels = [2] * nbr_conv if in_channels is None else in_channels + out_channels = [1 * nbr_conv] if out_channels is None else out_channels + kernel_size = [3] * nbr_conv if kernel_size is None else kernel_size + stride = [2] * nbr_conv if stride is None else stride + padding = [1] * nbr_conv if padding is None else padding + dilation = [1] * nbr_conv if dilation is None else dilation + groups = [1] * nbr_conv if groups is None else groups + bias = [True] * nbr_conv if bias is None else bias + padding_mode = ["zeros"] * nbr_conv if padding_mode is None else padding_mode + + # This allows the input parameters to be either a single value or a list + # as type hint implies + if not isinstance(in_channels, List): + in_channels = [in_channels] + if not isinstance(out_channels, List): + out_channels = [out_channels] + if not isinstance(kernel_size, List): + kernel_size = [kernel_size] + if not isinstance(stride, List): + stride = [stride] + if not isinstance(padding, List): + padding = [padding] + if not isinstance(dilation, List): + dilation = [dilation] + if not isinstance(groups, List): + groups = [groups] + if not isinstance(bias, List): + bias = [bias] + if not isinstance(padding_mode, List): + padding_mode = [padding_mode] + + # Generate test data if not provided + if inputs is None: + self.inputs = (torch.randn(batches, in_channels[0], length).to(dtype),) + else: + self.inputs = (inputs,) + + # Build chain of convs + for i in range(self.nbr_convs): + setattr( + self, + f"conv_{i}", + torch.nn.Conv1d( + in_channels=in_channels[i], + out_channels=out_channels[i], + kernel_size=kernel_size[i], + stride=stride[i], + padding=padding[i], + dilation=dilation[i], + groups=groups[i], + bias=bias[i], + padding_mode=padding_mode[i], + ).to(dtype), + ) + + def get_inputs(self): + return self.inputs + + def forward(self, x): + for i in range(self.nbr_convs): + conv = getattr(self, f"conv_{i}") + x = conv(x) + return x + + +conv1d_2_3x2x40_nobias = Conv1d( + in_channels=2, + out_channels=3, + kernel_size=2, + stride=1, + bias=False, + padding=0, + length=40, + batches=1, +) + +conv1d_3_1x3x256_st1 = Conv1d( + in_channels=3, + out_channels=10, + kernel_size=3, + stride=1, + padding=0, + length=256, + batches=1, +) + +conv1d_3_1x3x12_st2_pd1 = Conv1d( + in_channels=3, + out_channels=4, + kernel_size=3, + stride=2, + padding=1, + length=12, + batches=1, +) + +conv1d_1_1x2x128_st1 = Conv1d( + in_channels=2, + out_channels=1, + kernel_size=1, + stride=1, + padding=0, + length=128, + batches=1, +) + +conv1d_2_1x2x14_st2 = Conv1d( + in_channels=2, + out_channels=1, + kernel_size=2, + stride=2, + padding=0, + length=14, + batches=1, +) + +conv1d_5_3x2x128_st1 = Conv1d( + in_channels=2, + out_channels=3, + kernel_size=5, + stride=1, + padding=0, + length=128, + batches=3, +) + 
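Each of these module-level Conv1d instances bundles one configuration together with matching random inputs for the parameterized tests that follow. A minimal usage sketch (output shape follows from length=40, kernel_size=2, stride=1, padding=0):

# Sketch only: exercises the conv1d_2_3x2x40_nobias module defined above.
model = conv1d_2_3x2x40_nobias
(x,) = model.get_inputs()   # x: (1, 2, 40)
y = model(x)                # y: (1, 3, 39), since L_out = (40 - 2) // 1 + 1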
+conv1d_3_1x3x224_st2_pd1 = Conv1d( + in_channels=3, + out_channels=16, + kernel_size=3, + stride=2, + padding=1, + length=224, + batches=1, +) + +two_conv1d_nobias = Conv1d( + nbr_conv=2, + length=256, + in_channels=[3, 10], + out_channels=[10, 15], + kernel_size=[5, 5], + stride=[1, 1], + padding=[0, 0], + bias=[False, False], + batches=1, +) + +two_conv1d = Conv1d( + nbr_conv=2, + length=256, + in_channels=[3, 10], + out_channels=[10, 15], + kernel_size=[5, 5], + stride=[1, 1], + padding=[0, 0], + bias=[True, True], + batches=1, +) + +# Shenanigan to get a nicer output when test fails. With unittest it looks like: +# FAIL: test_conv1d_tosa_BI_2_3x3_1x3x12x12_st2_pd1 +testsuite = [ + ("2_3x2x40_nobias", conv1d_2_3x2x40_nobias), + ("3_1x3x256_st1", conv1d_3_1x3x256_st1), + ("3_1x3x12_st2_pd1", conv1d_3_1x3x12_st2_pd1), + ("1_1x2x128_st1", conv1d_1_1x2x128_st1), + ("2_1x2x14_st2", conv1d_2_1x2x14_st2), + ("5_3x2x128_st1", conv1d_5_3x2x128_st1), + ("3_1x3x224_st2_pd1", conv1d_3_1x3x224_st2_pd1), + ("two_conv1d_nobias", two_conv1d_nobias), + ("two_conv1d", two_conv1d), +] + + +class TestConv1D(unittest.TestCase): + def _test_conv1d_tosa_MI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(permute_memory_to_nhwc=True), + ) + .export() + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .check_not(["executorch_exir_dialects_edge__ops_aten_convolution_default"]) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_conv1d_tosa_BI_pipeline( + self, + module: torch.nn.Module, + test_data: Tuple[torch.Tensor], + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(permute_memory_to_nhwc=True), + ) + .quantize() + .export() + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .check_not(["executorch_exir_dialects_edge__ops_aten_convolution_default"]) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data, qtol=1) + ) + + def _test_conv1d_ethosu_BI_pipeline( + self, + module: torch.nn.Module, + compile_spec: CompileSpec, + test_data: Tuple[torch.Tensor], + ): + ( + ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) + .quantize() + .export() + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .check_not(["executorch_exir_dialects_edge__ops_aten_convolution_default"]) + .to_executorch() + ) + + @parameterized.expand(testsuite) + def test_conv1d_tosa_MI(self, test_name, model): + self._test_conv1d_tosa_MI_pipeline(model, model.get_inputs()) + + @parameterized.expand(testsuite) + def test_conv1d_tosa_BI(self, test_name, model): + self._test_conv1d_tosa_BI_pipeline(model, model.get_inputs()) + + # Expeted to fail as Conv1D requires transpoes which isn't supported on u55 + @parameterized.expand(testsuite) + @unittest.expectedFailure + def test_conv1d_u55_BI(self, test_name, model): + self._test_conv1d_ethosu_BI_pipeline( + model, common.get_u55_compile_spec(), model.get_inputs() + ) + + @parameterized.expand(testsuite) + def test_conv1d_u85_BI(self, test_name, model): + self._test_conv1d_ethosu_BI_pipeline( + model, common.get_u85_compile_spec(), model.get_inputs() + ) diff --git a/backends/arm/test/ops/test_conv.py b/backends/arm/test/ops/test_conv2d.py similarity index 98% rename from 
backends/arm/test/ops/test_conv.py rename to backends/arm/test/ops/test_conv2d.py index decf790ce5..46adfc8a01 100644 --- a/backends/arm/test/ops/test_conv.py +++ b/backends/arm/test/ops/test_conv2d.py @@ -6,7 +6,7 @@ import unittest -from typing import List, Tuple, Union +from typing import List, Optional, Tuple, Union import torch from executorch.backends.arm.test import common @@ -18,13 +18,13 @@ class Conv2d(torch.nn.Module): """ - Creates one or many chained convolutions. For multiple convolutions, the + Creates one or many chained 2D-convolutions. For multiple convolutions, the respective parameteres are provided as lists. """ def __init__( self, - inputs: torch.Tensor = None, + inputs: Optional[torch.Tensor] = None, height=8, width=8, nbr_conv=1, # Number of chained convs diff --git a/backends/arm/test/ops/test_depthwise_conv.py b/backends/arm/test/ops/test_depthwise_conv.py index a63066bee6..01ffbc1054 100644 --- a/backends/arm/test/ops/test_depthwise_conv.py +++ b/backends/arm/test/ops/test_depthwise_conv.py @@ -4,7 +4,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import logging import unittest from typing import Tuple @@ -13,14 +12,13 @@ import torch from executorch.backends.arm.test import common -from executorch.backends.arm.test.ops.test_conv import Conv2d +from executorch.backends.arm.test.ops.test_conv1d import Conv1d +from executorch.backends.arm.test.ops.test_conv2d import Conv2d from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.exir.backend.backend_details import CompileSpec from parameterized import parameterized -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) """ The configuration when @@ -29,6 +27,29 @@ where K is a positive integer is termed in literature as depthwise convolution. """ + +dw_conv1d_3_1x3x14_gp3_st1 = Conv1d( + in_channels=3, + out_channels=3, + kernel_size=7, + stride=1, + groups=3, + length=14, + batches=1, + padding=3, +) + +dw_conv1d_2_1x6x4_gp6_st1 = Conv1d( + in_channels=6, + out_channels=12, + kernel_size=2, + stride=1, + groups=6, + padding=0, + length=4, + batches=1, +) + dw_conv2d_2x2_1x6x4x4_gp6_st1 = Conv2d( in_channels=6, out_channels=12, @@ -41,6 +62,17 @@ batches=1, ) +dw_conv1d_3_1x3x256_gp3_st1 = Conv1d( + in_channels=3, + out_channels=3, + kernel_size=3, + stride=1, + groups=3, + padding=0, + length=256, + batches=1, +) + dw_conv2d_3x3_1x3x256x256_gp3_st1 = Conv2d( in_channels=3, out_channels=3, @@ -89,6 +121,19 @@ batches=1, ) +two_dw_conv1d = Conv1d( + nbr_conv=2, + length=64, + in_channels=[4, 8], + out_channels=[8, 24], + kernel_size=[3, 3], + stride=[1, 1], + padding=[0, 0], + groups=[4, 8], + bias=[True, True], + batches=1, +) + two_dw_conv2d = Conv2d( nbr_conv=2, width=64, @@ -104,7 +149,7 @@ ) # Shenanigan to get a nicer output when test fails. -testsuite = [ +testsuite_conv2d = [ ("2x2_1x6x4x4_gp6_st1", dw_conv2d_2x2_1x6x4x4_gp6_st1), ("3x3_1x3x256x256_gp3_st1", dw_conv2d_3x3_1x3x256x256_gp3_st1), ("3x3_1x4x256x256_gp4_st1", dw_conv2d_3x3_1x4x256x256_gp4_st1), @@ -113,12 +158,19 @@ ("two_dw_conv2d", two_dw_conv2d), ] +testsuite_conv1d = [ + ("2_1x6x4_gp6_st1", dw_conv1d_2_1x6x4_gp6_st1), + ("3_1x3x256_gp3_st1", dw_conv1d_3_1x3x256_gp3_st1), + ("two_dw_conv1d", two_dw_conv1d), + ("3_1x3x14_gp3_st1", dw_conv1d_3_1x3x14_gp3_st1), +] + -class TestDepthwiseConv2D(unittest.TestCase): - """Tests Conv2D where groups == in_channels and out_channels = K * in_channels. 
This +class TestDepthwiseConv(unittest.TestCase): + """Tests Conv1D and Conv2D where groups == in_channels and out_channels = K * in_channels. This is a special case enables depthwise convolution.""" - def _test_dw_conv2d_tosa_MI_pipeline( + def _test_dw_conv_tosa_MI_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] ): ( @@ -136,7 +188,7 @@ def _test_dw_conv2d_tosa_MI_pipeline( .run_method_and_compare_outputs(inputs=test_data) ) - def _test_dw_conv2d_tosa_BI_pipeline( + def _test_dw_conv_tosa_BI_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] ): ( @@ -155,7 +207,7 @@ def _test_dw_conv2d_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_dw_conv2d_ethos_BI_pipeline( + def _test_dw_conv_ethos_BI_pipeline( self, module: torch.nn.Module, compile_spec: CompileSpec, @@ -176,21 +228,36 @@ def _test_dw_conv2d_ethos_BI_pipeline( .to_executorch() ) - @parameterized.expand(testsuite) - def test_dw_conv2d_tosa_MI(self, test_name: str, model: torch.nn.Module): - self._test_dw_conv2d_tosa_MI_pipeline(model, model.get_inputs()) + @parameterized.expand(testsuite_conv1d + testsuite_conv2d) + def test_dw_conv_tosa_MI(self, test_name: str, model: torch.nn.Module): + self._test_dw_conv_tosa_MI_pipeline(model, model.get_inputs()) # TODO: Investigate flakyness (MLTORCH-307) - @parameterized.expand(testsuite) + @parameterized.expand(testsuite_conv1d + testsuite_conv2d) @pytest.mark.flaky(reruns=3) - def test_dw_conv2d_tosa_BI(self, test_name: str, model: torch.nn.Module): - self._test_dw_conv2d_tosa_BI_pipeline(model, model.get_inputs()) + def test_dw_conv_tosa_BI(self, test_name: str, model: torch.nn.Module): + self._test_dw_conv_tosa_BI_pipeline(model, model.get_inputs()) - @parameterized.expand(testsuite, skip_on_empty=True) + @parameterized.expand(testsuite_conv2d, skip_on_empty=True) def test_dw_conv2d_u55_BI( self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False ): - self._test_dw_conv2d_ethos_BI_pipeline( + self._test_dw_conv_ethos_BI_pipeline( + model, + common.get_u55_compile_spec( + permute_memory_to_nhwc=True, quantize_io=set_quantize_io + ), + model.get_inputs(), + ) + + # Expected to fail as conv1d needs transpose which is not supported + # on u55. 
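As the module docstring notes, the depthwise configuration sets groups == in_channels with out_channels = K * in_channels, so each input channel is convolved with its own K filters. A small eager-mode illustration matching dw_conv1d_2_1x6x4_gp6_st1 above:

import torch

# Depthwise: groups == in_channels, out_channels == K * in_channels (here K = 2).
dw = torch.nn.Conv1d(in_channels=6, out_channels=12, kernel_size=2, groups=6)
x = torch.randn(1, 6, 4)
y = dw(x)
print(y.shape)  # torch.Size([1, 12, 3]); each of the 6 channels feeds 2 filters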
+ @parameterized.expand(testsuite_conv1d, skip_on_empty=True) + @unittest.expectedFailure + def test_dw_conv1d_u55_BI( + self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False + ): + self._test_dw_conv_ethos_BI_pipeline( model, common.get_u55_compile_spec( permute_memory_to_nhwc=True, quantize_io=set_quantize_io @@ -198,11 +265,11 @@ def test_dw_conv2d_u55_BI( model.get_inputs(), ) - @parameterized.expand(testsuite) - def test_dw_conv2d_u85_BI( + @parameterized.expand(testsuite_conv1d + testsuite_conv2d) + def test_dw_conv_u85_BI( self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False ): - self._test_dw_conv2d_ethos_BI_pipeline( + self._test_dw_conv_ethos_BI_pipeline( model, common.get_u85_compile_spec( permute_memory_to_nhwc=True, quantize_io=set_quantize_io diff --git a/backends/arm/test/passes/test_tag_io_quant_pass.py b/backends/arm/test/passes/test_tag_io_quant_pass.py index 9f292bb7ca..639bf478bc 100644 --- a/backends/arm/test/passes/test_tag_io_quant_pass.py +++ b/backends/arm/test/passes/test_tag_io_quant_pass.py @@ -12,13 +12,13 @@ from executorch.backends.arm.test.tester.arm_tester import ArmTester -class Add(torch.nn.Module): +class TwoInputsTwoOutputs(torch.nn.Module): def get_inputs(self): - return (torch.rand(1, 10, 10, 10),) + return (torch.rand(1, 10, 10, 10), (torch.rand(1, 10, 10, 10))) - def forward(self, x): - return x + x + def forward(self, x, y): + return (x + y, x * y) class TestTagIOQuantPass(unittest.TestCase): @@ -36,29 +36,29 @@ def _tosa_BI_u55_pipeline(self, module: torch.nn.Module): .to_edge() .check_count( { - "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2 + "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 4 } ) .check_count( { - "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 2 + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 6 } ) .partition() .check_count( { - "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 1 + "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2 } ) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_count( { - "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 1 + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 2 } ) # .to_executorch() requires additional steps ) def test_BI_u55_artifact(self): - model = Add() + model = TwoInputsTwoOutputs() self._tosa_BI_u55_pipeline(model) diff --git a/backends/arm/test/runner_utils.py b/backends/arm/test/runner_utils.py index 167b99a328..3e9d3620cc 100644 --- a/backends/arm/test/runner_utils.py +++ b/backends/arm/test/runner_utils.py @@ -101,7 +101,7 @@ def _get_input_quantization_params( ): # break early if we have all the inputs quantized parameters break if len(quant_params) == 0: - raise RuntimeError("No Quantization parameters not found in exported model.") + raise RuntimeError("No Quantization parameters found in exported model.") return quant_params diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml index b4ec338291..627f211557 100644 --- a/backends/cadence/aot/functions_hifi.yaml +++ b/backends/cadence/aot/functions_hifi.yaml @@ -25,7 +25,7 @@ - op: add.out kernels: - arg_meta: null - kernel_name: torch::executor::add_out + kernel_name: cadence::impl::HiFi::add_out - op: 
bmm.out kernels: @@ -45,12 +45,12 @@ - op: div.out kernels: - arg_meta: null - kernel_name: torch::executor::div_out + kernel_name: cadence::impl::HiFi::div_out - op: div.out_mode kernels: - arg_meta: null - kernel_name: torch::executor::div_out_mode + kernel_name: cadence::impl::HiFi::div_out_mode - op: embedding.out kernels: @@ -65,7 +65,7 @@ - op: mul.out kernels: - arg_meta: null - kernel_name: torch::executor::mul_out + kernel_name: cadence::impl::HiFi::mul_out - op: permute_copy.out kernels: @@ -75,7 +75,7 @@ - op: sigmoid.out kernels: - arg_meta: null - kernel_name: torch::executor::sigmoid_out + kernel_name: cadence::impl::HiFi::sigmoid_out - op: slice_copy.Tensor_out kernels: @@ -90,7 +90,12 @@ - op: sub.out kernels: - arg_meta: null - kernel_name: torch::executor::sub_out + kernel_name: cadence::impl::HiFi::sub_out + +- op: tanh.out + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::tanh_out - op: view_copy.out kernels: diff --git a/backends/cadence/cadence.cmake b/backends/cadence/cadence.cmake index 25f241f205..0fa55c6a65 100644 --- a/backends/cadence/cadence.cmake +++ b/backends/cadence/cadence.cmake @@ -43,6 +43,9 @@ set(CMAKE_CXX_COMPILER ${TOOLCHAIN_HOME}/bin/${CROSS_COMPILE_TARGET}-clang++) set(CMAKE_C_FLAGS_INIT "-stdlib=libc++ -mtext-section-literals -mlongcalls") set(CMAKE_CXX_FLAGS_INIT "-stdlib=libc++ -mtext-section-literals -mlongcalls") +#workaround for larger compilation time +set(CMAKE_CXX_FLAGS_INIT "${CMAKE_CXX_FLAGS_INIT} -fno-strict-aliasing") + set(CMAKE_SYSROOT ${TOOLCHAIN_HOME}/${SYSROOT_TARGET}) set(CMAKE_LINKER ${TOOLCHAIN_HOME}/bin/xt-ld) add_link_options(-lm -stdlib=libc++ -Wl,--no-as-needed -static) diff --git a/backends/cadence/hifi/kernels/CMakeLists.txt b/backends/cadence/hifi/kernels/CMakeLists.txt index d03bb1c01e..8fee7e8536 100644 --- a/backends/cadence/hifi/kernels/CMakeLists.txt +++ b/backends/cadence/hifi/kernels/CMakeLists.txt @@ -9,6 +9,10 @@ add_library( cadence_kernels kernels.cpp ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/matmul_asym8uxasym8u_asym8u.cpp + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c ) # Let files say "include ". set(_common_include_directories ${EXECUTORCH_ROOT}/..) 
diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h index 57fe0e140d..70d5e39fad 100644 --- a/backends/cadence/hifi/kernels/kernels.h +++ b/backends/cadence/hifi/kernels/kernels.h @@ -11,6 +11,49 @@ #include #include #include +/* For NNLIB APIs */ +#include "xa_nnlib_kernels_api.h" + +/* Potential NNLIB function/APIs */ +extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32( + FLOAT32* __restrict__ p_out, + const WORD32* const p_out_shape, + const FLOAT32* __restrict__ p_inp1, + const WORD32* const p_inp1_shape, + const FLOAT32* __restrict__ p_inp2, + const WORD32* const p_inp2_shape); + +extern "C" WORD32 xa_nn_elm_div_broadcast_4D_f32xf32_f32( + FLOAT32* __restrict__ p_out, + const WORD32* const p_out_shape, + const FLOAT32* __restrict__ p_inp1, + const WORD32* const p_inp1_shape, + const FLOAT32* __restrict__ p_inp2, + const WORD32* const p_inp2_shape); + +extern "C" WORD32 xa_nn_elm_div_mode_f32xf32_f32( + FLOAT32* __restrict__ p_out, + const FLOAT32* __restrict__ p_inp1, + const FLOAT32* __restrict__ p_inp2, + WORD32 num_elm, + WORD32 mode); + +extern "C" WORD32 xa_nn_elm_div_mode_broadcast_4D_f32xf32_f32( + FLOAT32* __restrict__ p_out, + const WORD32* const p_out_shape, + const FLOAT32* __restrict__ p_inp1, + const WORD32* const p_inp1_shape, + const FLOAT32* __restrict__ p_inp2, + const WORD32* const p_inp2_shape, + WORD32 mode); + +extern "C" WORD32 xa_nn_elm_mul_broadcast_4D_f32xf32_f32( + FLOAT32* __restrict__ p_out, + const WORD32* const p_out_shape, + const FLOAT32* __restrict__ p_inp1, + const WORD32* const p_inp1_shape, + const FLOAT32* __restrict__ p_inp2, + const WORD32* const p_inp2_shape); namespace cadence { namespace impl { diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt index 78413ef312..cbbb279e5d 100644 --- a/backends/cadence/hifi/operators/CMakeLists.txt +++ b/backends/cadence/hifi/operators/CMakeLists.txt @@ -20,6 +20,12 @@ endif() # ATen compliant ops that are needed to run this model. 
set(_aten_ops__srcs + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_add.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_div.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mul.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sigmoid.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sub.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_tanh.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/activation_ops_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/copy_ops_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/broadcast_util.cpp" @@ -29,24 +35,29 @@ set(_aten_ops__srcs "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/reduce_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/slice_util.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_add.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_cat.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_div.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_embedding.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_full.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_mul.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_permute_copy.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sigmoid.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_slice_copy.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_softmax.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_split_with_sizes_copy.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sub.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_to_copy.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_view_copy.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_where.cpp" -) + "${EXECUTORCH_ROOT}/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/activation_ops_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/broadcast_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/copy_ops_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/dtype_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/index_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/kernel_ops_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/matmul_ops_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/reduce_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp" + ) add_library(aten_ops_cadence ${_aten_ops__srcs}) target_link_libraries(aten_ops_cadence PUBLIC executorch) target_link_libraries(aten_ops_cadence PRIVATE cadence_kernels) diff --git a/backends/cadence/hifi/operators/op_add.cpp b/backends/cadence/hifi/operators/op_add.cpp new file mode 100644 index 0000000000..43cb0d8cd6 --- /dev/null +++ b/backends/cadence/hifi/operators/op_add.cpp @@ -0,0 +1,206 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +using exec_aten::Scalar; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::runtime::can_cast; +using executorch::runtime::CppTypeToScalarType; +using executorch::runtime::KernelRuntimeContext; +using torch::executor::Error; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { + +namespace { +template < + bool can_cast, + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct AddInner; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct AddInner { + static void + run(const Tensor& a, const Tensor& b, CTYPE_IN alpha_val, Tensor& out) { + torch::executor::apply_binary_elementwise_fn( + // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) + [alpha_val](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = a_casted + alpha_val * b_casted; + + return static_cast(value); + }, + a, + b, + out); + } +}; + +template +struct ReportCanCastBug { + static void run(const Tensor&, const Tensor&, CTYPE_IN, Tensor&) { + ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); + } +}; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct AddInner + : public ReportCanCastBug {}; + +} // namespace + +Tensor& add_out( + KernelRuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + const Scalar& alpha, + Tensor& out) { + ET_KERNEL_CHECK( + ctx, + torch::executor::resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensor_is_realhbbf16_type(out), + InvalidArgument, + out); + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(a, b, out), + InvalidArgument, + out); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType alpha_type = + torch::executor::native::utils::get_scalar_dtype(alpha); + ScalarType common_type = + executorch::runtime::promoteTypes(a_type, b_type, /*half_to_float*/ true); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK( + ctx, + executorch::runtime::canCast(common_type, out_type), + InvalidArgument, + out); + ET_KERNEL_CHECK( + ctx, + torch::executor::check_alpha_type(alpha_type, common_type), + InvalidArgument, + out); + + float alpha_val; + torch::executor::native::utils::extract_scalar(alpha, &alpha_val); + + constexpr auto name = "add.out"; + constexpr int kNnlibMaxDim = 4; /*fallback if broadcast and dim > 4 */ + + int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim(); + bool optimized = 1; + /*find broadcast*/ + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool broadcast = (a_is_broadcasted || b_is_broadcasted); + int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = out.dim() > max_dim ? 
out.dim() : max_dim; + + if ((out_type != ScalarType::Float) || (alpha_val != 1.0)) + optimized = 0; + + if ((a_dim == 0) || (b_dim == 0)) + optimized = 0; + + if ((broadcast == 1) && (max_dim > kNnlibMaxDim)) + optimized = 0; + + if (optimized) { + const float* const a_data = a.const_data_ptr(); + const float* const b_data = b.const_data_ptr(); + float* const out_data = out.mutable_data_ptr(); + + if (broadcast == 1) { + int out_shape[kNnlibMaxDim]; + int inp1_shape[kNnlibMaxDim]; + int inp2_shape[kNnlibMaxDim]; + + for (int i = 0; i < kNnlibMaxDim; i++) { + out_shape[i] = 1; + inp1_shape[i] = 1; + inp2_shape[i] = 1; + } + + int off_o = kNnlibMaxDim - out.dim(); + int off_a = kNnlibMaxDim - a.dim(); + int off_b = kNnlibMaxDim - b.dim(); + + for (int i = 0; i < out.dim(); i++) + out_shape[i + off_o] = out.size(i); + for (int i = 0; i < a.dim(); i++) + inp1_shape[i + off_a] = a.size(i); + for (int i = 0; i < b.dim(); i++) + inp2_shape[i + off_b] = b.size(i); + + xa_nn_elm_add_broadcast_4D_f32xf32_f32( + out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape); + } else { + xa_nn_elm_add_f32xf32_f32(out_data, a_data, b_data, out.numel()); + } + + return out; + } + + ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, name, CTYPE_A, [&]() { + ET_SWITCH_REALHBBF16_TYPES(b_type, ctx, name, CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); + CTYPE_IN alpha_val; + torch::executor::native::utils::extract_scalar(alpha, &alpha_val); + + ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { + AddInner< + can_cast::value, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, alpha_val, out); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence diff --git a/backends/cadence/hifi/operators/op_div.cpp b/backends/cadence/hifi/operators/op_div.cpp new file mode 100644 index 0000000000..88e670b432 --- /dev/null +++ b/backends/cadence/hifi/operators/op_div.cpp @@ -0,0 +1,288 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
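The HiFi elementwise operators (op_add above, op_div and op_mul below) take the NNLIB fast path only for float tensors with no zero-dimensional operands, and, when broadcasting, only up to kNnlibMaxDim = 4 dimensions; operand shapes are left-padded with 1s before the 4-D broadcast kernels are called. A small Python sketch of that shape handling (the helper names are assumptions, not part of the kernels):

KNNLIB_MAX_DIM = 4  # mirrors kNnlibMaxDim in the operators above

def pad_shape_to_4d(shape):
    """Left-pad a tensor shape with 1s so the 4-D broadcast kernels can use it."""
    return [1] * (KNNLIB_MAX_DIM - len(shape)) + list(shape)

def use_nnlib_fast_path(a_shape, b_shape, out_shape, dtype_is_float):
    """Sketch of the 'optimized' predicate: float dtypes, no 0-d operands,
    and broadcasting only up to 4 dimensions (op-specific checks omitted)."""
    if not dtype_is_float or len(a_shape) == 0 or len(b_shape) == 0:
        return False
    broadcast = (a_shape != out_shape) or (b_shape != out_shape)
    max_dim = max(len(a_shape), len(b_shape), len(out_shape))
    return not (broadcast and max_dim > KNNLIB_MAX_DIM)

print(pad_shape_to_4d((3, 5)))                                   # [1, 1, 3, 5]
print(use_nnlib_fast_path((2, 3, 5), (3, 5), (2, 3, 5), True))   # True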
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +using exec_aten::Scalar; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::aten::RuntimeContext; +using torch::executor::Error; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { + +namespace { + +ScalarType get_compute_type(ScalarType a_type, ScalarType b_type) { + if (executorch::runtime::isFloatingType(a_type) && + executorch::runtime::isFloatingType(b_type)) { + return executorch::runtime::promoteTypes(a_type, b_type); + } else if (executorch::runtime::isFloatingType(a_type)) { + return a_type; + } else if (executorch::runtime::isFloatingType(b_type)) { + return b_type; + } + return ScalarType::Float; +} + +} // namespace + +Tensor& +div_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) { + ET_KERNEL_CHECK( + ctx, + torch::executor::resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + + ET_KERNEL_CHECK( + ctx, + !executorch::runtime::isComplexType(a_type) && + !executorch::runtime::isQIntType(a_type) && + !executorch::runtime::isBitsType(a_type), + InvalidArgument, + out); + ET_KERNEL_CHECK( + ctx, + !executorch::runtime::isComplexType(b_type) && + !executorch::runtime::isQIntType(b_type) && + !executorch::runtime::isBitsType(b_type), + InvalidArgument, + out); + + ET_KERNEL_CHECK( + ctx, executorch::runtime::tensor_is_real_type(out), InvalidArgument, out); + + constexpr int kNnlibMaxDim = 4; /*fallback if broadcast and dim > 4 */ + int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim(); + bool optimized = 1; + /*find broadcast*/ + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool broadcast = (a_is_broadcasted || b_is_broadcasted); + int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = out.dim() > max_dim ? 
out.dim() : max_dim; + + if ((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) + optimized = 0; + + if ((a_dim == 0) || (b_dim == 0)) + optimized = 0; + + if ((broadcast == 1) && (max_dim > kNnlibMaxDim)) + optimized = 0; + + if (optimized) { + float* a_data = a.mutable_data_ptr(); + float* b_data = b.mutable_data_ptr(); + float* out_data = out.mutable_data_ptr(); + + if (broadcast == 1) { + int out_shape[kNnlibMaxDim]; + int inp1_shape[kNnlibMaxDim]; + int inp2_shape[kNnlibMaxDim]; + + for (int i = 0; i < kNnlibMaxDim; i++) { + out_shape[i] = 1; + inp1_shape[i] = 1; + inp2_shape[i] = 1; + } + + int off_o = kNnlibMaxDim - out.dim(); + int off_a = kNnlibMaxDim - a.dim(); + int off_b = kNnlibMaxDim - b.dim(); + for (int i = 0; i < out.dim(); i++) + out_shape[i + off_o] = out.size(i); + for (int i = 0; i < a.dim(); i++) + inp1_shape[i + off_a] = a.size(i); + for (int i = 0; i < b.dim(); i++) + inp2_shape[i + off_b] = b.size(i); + + xa_nn_elm_div_broadcast_4D_f32xf32_f32( + out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape); + } else { + xa_nn_elm_div_f32xf32_f32(out_data, a_data, b_data, out.numel()); + } + + return out; + } + + ScalarType common_type = get_compute_type(a_type, b_type); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK( + ctx, + executorch::runtime::canCast(common_type, out_type), + InvalidArgument, + out); + + ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "div.out", CTYPE_A, [&]() { + ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, "div.out", CTYPE_B, [&]() { + ET_SWITCH_FLOAT_TYPES(common_type, ctx, "div.out", CTYPE_IN, [&]() { + ET_SWITCH_FLOAT_TYPES(out_type, ctx, "div.out", CTYPE_OUT, [&]() { + torch::executor:: + apply_binary_elementwise_fn( + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = a_casted / b_casted; + + return static_cast(value); + }, + a, + b, + out); + }); + }); + }); + }); + + return out; +} + +Tensor& div_out_mode( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + exec_aten::optional mode, + Tensor& out) { + ET_KERNEL_CHECK( + ctx, + torch::executor::resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = get_compute_type(a_type, b_type); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK( + ctx, executorch::runtime::tensor_is_real_type(out), InvalidArgument, out); + + // Allow casting float -> integral here + // non-bool -> bool is still disallowed + ET_KERNEL_CHECK( + ctx, + !(common_type != ScalarType::Bool && out_type == ScalarType::Bool), + InvalidArgument, + out); + constexpr int kNnlibMaxDim = 4; /*fallback if broadcast and dim > 4 */ + int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim(); + bool optimized = 1; + /*find broadcast*/ + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool broadcast = (a_is_broadcasted || b_is_broadcasted); + int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = out.dim() > max_dim ? 
out.dim() : max_dim; + + if ((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) + optimized = 0; + + if ((a_dim == 0) || (b_dim == 0)) + optimized = 0; + + if ((broadcast == 1) && (max_dim > kNnlibMaxDim)) + optimized = 0; + int mode_val = -1; + if (mode.has_value() && mode.value() == "trunc") + mode_val = 0; + else if (mode.has_value() && mode.value() == "floor") + mode_val = 1; + else + optimized = 0; + + if (optimized) { + float* a_data = a.mutable_data_ptr(); + float* b_data = b.mutable_data_ptr(); + float* out_data = out.mutable_data_ptr(); + + if (broadcast) { + int out_shape[kNnlibMaxDim]; + int inp1_shape[kNnlibMaxDim]; + int inp2_shape[kNnlibMaxDim]; + + for (int i = 0; i < kNnlibMaxDim; i++) { + inp1_shape[i] = 1; + inp2_shape[i] = 1; + out_shape[i] = 1; + } + + int off_o = kNnlibMaxDim - out.dim(); + int off_a = kNnlibMaxDim - a.dim(); + int off_b = kNnlibMaxDim - b.dim(); + + for (int i = 0; i < out.dim(); i++) + out_shape[i + off_o] = out.size(i); + for (int i = 0; i < a.dim(); i++) + inp1_shape[i + off_a] = a.size(i); + for (int i = 0; i < b.dim(); i++) + inp2_shape[i + off_b] = b.size(i); + + xa_nn_elm_div_mode_broadcast_4D_f32xf32_f32( + out_data, + out_shape, + a_data, + inp1_shape, + b_data, + inp2_shape, + mode_val); + } else { + xa_nn_elm_div_mode_f32xf32_f32( + out_data, a_data, b_data, out.numel(), mode_val); + } + + return out; + } + + ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "div.out_mode", CTYPE_A, [&]() { + ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, "div.out_mode", CTYPE_B, [&]() { + ET_SWITCH_FLOAT_TYPES(common_type, ctx, "div.out_mode", CTYPE_IN, [&]() { + ET_SWITCH_REAL_TYPES(out_type, ctx, "div.out_mode", CTYPE_OUT, [&]() { + torch::executor:: + apply_binary_elementwise_fn( + [mode](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = a_casted / b_casted; + if (mode.has_value() && mode.value() == "trunc") { + value = std::trunc(value); + } else if (mode.has_value() && mode.value() == "floor") { + value = std::floor(value); + } + return static_cast(value); + }, + a, + b, + out); + }); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence \ No newline at end of file diff --git a/backends/cadence/hifi/operators/op_mul.cpp b/backends/cadence/hifi/operators/op_mul.cpp new file mode 100644 index 0000000000..ad12606bdf --- /dev/null +++ b/backends/cadence/hifi/operators/op_mul.cpp @@ -0,0 +1,169 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
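The div.out_mode variant above maps the optional rounding-mode string onto the NNLib flag ("trunc" -> 0, "floor" -> 1) and leaves the fast path for any other value; the portable lambda applies std::trunc/std::floor after the division. A reference-semantics sketch for a single element pair, with hypothetical naming:

    // One-element reference semantics of div.out_mode: divide in the compute
    // type, then apply the optional rounding mode.
    #include <cassert>
    #include <cmath>
    #include <optional>
    #include <string>

    double div_with_mode(double a, double b,
                         const std::optional<std::string>& mode) {
      double value = a / b;
      if (mode.has_value() && *mode == "trunc") {
        value = std::trunc(value);  // NNLib mode flag 0 on the fast path
      } else if (mode.has_value() && *mode == "floor") {
        value = std::floor(value);  // NNLib mode flag 1 on the fast path
      }
      return value;
    }

    int main() {
      assert(div_with_mode(-7.0, 2.0, std::string("trunc")) == -3.0);
      assert(div_with_mode(-7.0, 2.0, std::string("floor")) == -4.0);
      assert(div_with_mode(-7.0, 2.0, std::nullopt) == -3.5);
      return 0;
    }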
+ */ + +#include +#include +#include +#include +#include +#include + +using exec_aten::Scalar; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::aten::RuntimeContext; +using executorch::runtime::can_cast; +using executorch::runtime::CppTypeToScalarType; +using torch::executor::Error; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { + +namespace { +template < + bool can_cast, + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct MulInner; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct MulInner { + static void run(const Tensor& a, const Tensor& b, Tensor& out) { + torch::executor::apply_binary_elementwise_fn( + // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = a_casted * b_casted; + + return static_cast(value); + }, + a, + b, + out); + } +}; + +struct ReportCanCastBug { + static void run(const Tensor&, const Tensor&, Tensor&) { + ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); + } +}; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct MulInner + : public ReportCanCastBug {}; +} // namespace + +Tensor& +mul_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) { + ET_KERNEL_CHECK( + ctx, + torch::executor::resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensor_is_realhb_type(out), + InvalidArgument, + out); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = + executorch::runtime::promoteTypes(a_type, b_type, /*half_to_float*/ true); + ScalarType out_type = out.scalar_type(); + constexpr int kNnlibMaxDim = 4; /*fallback if broadcast and dim > 4 */ + + int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim(); + bool optimized = 1; + /*find broadcast*/ + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool broadcast = (a_is_broadcasted || b_is_broadcasted); + int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = out.dim() > max_dim ? 
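MulInner above uses a bool non-type template parameter to select, at compile time, either the real elementwise body or a "should never happen" stub (ReportCanCastBug), so type combinations that cannot cast to the output dtype are never instantiated as arithmetic code. A minimal self-contained sketch of the same dispatch pattern:

    // Compile-time dispatch on a bool template parameter: the false
    // specialization is a stub that should be unreachable at runtime.
    #include <cassert>
    #include <cstdio>

    template <bool can_cast, typename IN, typename OUT>
    struct Apply {
      static OUT run(IN a, IN b) { return static_cast<OUT>(a * b); }
    };

    template <typename IN, typename OUT>
    struct Apply<false, IN, OUT> {
      static OUT run(IN, IN) {
        // The runtime canCast check rejects this combination earlier;
        // reaching this stub would be a bug.
        assert(false && "canCast should have been checked above");
        return OUT{};
      }
    };

    int main() {
      // float -> float is castable, so the arithmetic specialization runs.
      std::printf("%f\n", Apply<true, float, float>::run(2.0f, 3.5f));
      return 0;
    }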
out.dim() : max_dim; + + if ((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) + optimized = 0; + + if ((a_dim == 0) || (b_dim == 0)) + optimized = 0; + + if ((broadcast == 1) && (max_dim > kNnlibMaxDim)) + optimized = 0; + + if (optimized) { + float* a_data = a.mutable_data_ptr(); + float* b_data = b.mutable_data_ptr(); + float* out_data = out.mutable_data_ptr(); + + if (broadcast == 1) { + int out_shape[kNnlibMaxDim]; + int inp1_shape[kNnlibMaxDim]; + int inp2_shape[kNnlibMaxDim]; + for (int i = 0; i < kNnlibMaxDim; i++) { + out_shape[i] = 1; + inp1_shape[i] = 1; + inp2_shape[i] = 1; + } + int off_o = kNnlibMaxDim - out.dim(); + int off_a = kNnlibMaxDim - a.dim(); + int off_b = kNnlibMaxDim - b.dim(); + for (int i = 0; i < out.dim(); i++) + out_shape[i + off_o] = out.size(i); + for (int i = 0; i < a.dim(); i++) + inp1_shape[i + off_a] = a.size(i); + for (int i = 0; i < b.dim(); i++) + inp2_shape[i + off_b] = b.size(i); + + xa_nn_elm_mul_broadcast_4D_f32xf32_f32( + out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape); + } else { + xa_nn_elm_mul_f32xf32_f32(out_data, a_data, b_data, out.numel()); + } + + return out; + } + + ET_SWITCH_REALHB_TYPES(a_type, ctx, "mul.out", CTYPE_A, [&]() { + ET_SWITCH_REALHB_TYPES(b_type, ctx, "mul.out", CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REALHB_TYPES(out_type, ctx, "mul.out", CTYPE_OUT, [&]() { + MulInner< + can_cast::value, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence \ No newline at end of file diff --git a/backends/cadence/hifi/operators/op_sigmoid.cpp b/backends/cadence/hifi/operators/op_sigmoid.cpp new file mode 100644 index 0000000000..b9fa73b879 --- /dev/null +++ b/backends/cadence/hifi/operators/op_sigmoid.cpp @@ -0,0 +1,82 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
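The mul_out fast path above, like the div paths earlier, is gated on three conditions: both inputs are float32, neither input is zero-dimensional, and a broadcasted case is only taken when every tensor has rank at most kNnlibMaxDim (4), since the NNLib kernels work on 4D shapes. A hypothetical predicate summarizing that gating:

    // Summary of the HiFi fast-path gating (illustrative helper name).
    #include <algorithm>
    #include <cassert>

    constexpr int kNnlibMaxDim = 4;

    bool can_use_nnlib_fast_path(bool a_is_float, bool b_is_float,
                                 int a_dim, int b_dim, int out_dim,
                                 bool broadcast) {
      if (!a_is_float || !b_is_float) return false;   // float32-only kernels
      if (a_dim == 0 || b_dim == 0) return false;     // no zero-dim inputs
      const int max_dim = std::max({a_dim, b_dim, out_dim});
      if (broadcast && max_dim > kNnlibMaxDim) return false;  // 4D limit
      return true;
    }

    int main() {
      assert(can_use_nnlib_fast_path(true, true, 2, 2, 2, false));
      assert(!can_use_nnlib_fast_path(true, true, 5, 1, 5, true));
      assert(!can_use_nnlib_fast_path(true, false, 2, 2, 2, false));
      return 0;
    }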
+ */ + +#include + +#include +#include +#include + +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::aten::RuntimeContext; +using torch::executor::Error; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { + +using Tensor = exec_aten::Tensor; + +Tensor& sigmoid_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { + (void)ctx; + + ET_KERNEL_CHECK( + ctx, in.scalar_type() != ScalarType::Bool, InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensor_is_floating_type(out), + InvalidArgument, + out); + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, in.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType in_type = in.scalar_type(); + ScalarType out_type = out.scalar_type(); + + bool optimized = 1; + if ((in_type != ScalarType::Float) || (out_type != ScalarType::Float)) + optimized = 0; + + if (optimized) { + float* data_in = in.mutable_data_ptr(); + float* data_out = out.mutable_data_ptr(); + xa_nn_vec_sigmoid_f32_f32(data_out, data_in, in.numel()); + + return out; + } + + ET_SWITCH_REALHB_TYPES(in_type, ctx, "sigmoid.out", CTYPE_IN, [&]() { + ET_SWITCH_FLOATH_TYPES(out_type, ctx, "sigmoid.out", CTYPE_OUT, [&]() { + torch::executor::apply_unary_map_fn( + [](const CTYPE_IN val_in) { + // perform math in double to preserve precision + double in_casted = static_cast(val_in); + double out_val = 1.0 / (1.0 + exp(-in_casted)); + return static_cast(out_val); + }, + in.const_data_ptr(), + out.mutable_data_ptr(), + in.numel()); + }); + }); + + return out; +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence \ No newline at end of file diff --git a/backends/cadence/hifi/operators/op_sub.cpp b/backends/cadence/hifi/operators/op_sub.cpp new file mode 100644 index 0000000000..0a362dbf95 --- /dev/null +++ b/backends/cadence/hifi/operators/op_sub.cpp @@ -0,0 +1,203 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
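When either tensor is not float32, sigmoid_out above falls back to an elementwise map that computes in double to preserve precision before casting back to the output type; float32 inputs go straight to xa_nn_vec_sigmoid_f32_f32. A portable sketch of the fallback map (illustrative helper, not the ExecuTorch apply_unary_map_fn API):

    // Elementwise sigmoid computed in double, cast to the output type.
    #include <cmath>
    #include <cstdio>
    #include <vector>

    template <typename IN, typename OUT>
    void sigmoid_map(const IN* in, OUT* out, long n) {
      for (long i = 0; i < n; i++) {
        const double x = static_cast<double>(in[i]);
        out[i] = static_cast<OUT>(1.0 / (1.0 + std::exp(-x)));
      }
    }

    int main() {
      std::vector<float> in = {-2.0f, 0.0f, 2.0f};
      std::vector<float> out(in.size());
      sigmoid_map(in.data(), out.data(), static_cast<long>(in.size()));
      for (float v : out) std::printf("%f\n", v);  // ~0.119, 0.5, ~0.881
      return 0;
    }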
+ */ + +#include +#include +#include +#include +#include +#include +#include + +using exec_aten::Scalar; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::aten::RuntimeContext; +using executorch::runtime::can_cast; +using executorch::runtime::CppTypeToScalarType; +using torch::executor::Error; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { + +namespace { +template < + bool can_cast, + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct SubInner; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct SubInner { + static void + run(const Tensor& a, const Tensor& b, CTYPE_IN alpha_val, Tensor& out) { + torch::executor::apply_binary_elementwise_fn( + // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) + [alpha_val](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = a_casted - alpha_val * b_casted; + + return static_cast(value); + }, + a, + b, + out); + } +}; + +template +struct ReportCanCastBug { + static void run(const Tensor&, const Tensor&, CTYPE_IN, Tensor&) { + ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); + } +}; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct SubInner + : public ReportCanCastBug {}; + +} // namespace + +Tensor& sub_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + const Scalar& alpha, + Tensor& out) { + ET_KERNEL_CHECK( + ctx, + torch::executor::resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensor_is_realh_type(out), + InvalidArgument, + out); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType alpha_type = + torch::executor::native::utils::get_scalar_dtype(alpha); + ScalarType common_type = + executorch::runtime::promoteTypes(a_type, b_type, /*half_to_float*/ true); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK( + ctx, + executorch::runtime::canCast(common_type, out_type), + InvalidArgument, + out); + ET_KERNEL_CHECK( + ctx, + torch::executor::check_alpha_type(alpha_type, common_type), + InvalidArgument, + out); + + float alpha_val; + torch::executor::native::utils::extract_scalar(alpha, &alpha_val); + + constexpr auto name = "sub.out"; + constexpr int kNnlibMaxDim = 4; /*fallback if broadcast and dim > 4 */ + + int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim(); + bool optimized = 1; + /*find broadcast*/ + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool broadcast = (a_is_broadcasted || b_is_broadcasted); + int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = out.dim() > max_dim ? 
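sub.out computes out = a - alpha * b in the promoted compute type, and the gating that follows only takes the NNLib fast path when alpha == 1.0, since xa_nn_elm_sub_f32xf32_f32 takes no alpha argument. A reference-semantics sketch for one element pair (hypothetical helper name):

    // One-element reference semantics of sub.out with alpha scaling.
    #include <cassert>

    template <typename IN, typename OUT>
    OUT sub_one(IN a, IN b, IN alpha) {
      return static_cast<OUT>(a - alpha * b);
    }

    int main() {
      assert((sub_one<float, float>(5.0f, 2.0f, 1.0f) == 3.0f));
      assert((sub_one<float, float>(5.0f, 2.0f, 0.5f) == 4.0f));
      return 0;
    }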
out.dim() : max_dim; + + if ((out_type != ScalarType::Float) || (alpha_val != 1.0)) + optimized = 0; + + if ((a_dim == 0) || (b_dim == 0)) + optimized = 0; + + if ((broadcast == 1) && (max_dim > kNnlibMaxDim)) + optimized = 0; + + if (optimized) { + /*logic to find broadcast*/ + const int a_is_broadcasted = !out.sizes().equals(a.sizes()); + const int b_is_broadcasted = !out.sizes().equals(b.sizes()); + const int broadcast = (a_is_broadcasted || b_is_broadcasted); + + const float* const a_data = a.const_data_ptr(); + const float* const b_data = b.const_data_ptr(); + float* const out_data = out.mutable_data_ptr(); + if (broadcast == 1) { + int out_shape[kNnlibMaxDim]; + int inp1_shape[kNnlibMaxDim]; + int inp2_shape[kNnlibMaxDim]; + + for (int i = 0; i < kNnlibMaxDim; i++) { + out_shape[i] = 1; + inp1_shape[i] = 1; + inp2_shape[i] = 1; + } + + int off_o = kNnlibMaxDim - out_dim; + int off_a = kNnlibMaxDim - a_dim; + int off_b = kNnlibMaxDim - b_dim; + for (int i = 0; i < out_dim; i++) + out_shape[i + off_o] = out.size(i); + for (int i = 0; i < a_dim; i++) + inp1_shape[i + off_a] = a.size(i); + for (int i = 0; i < b_dim; i++) + inp2_shape[i + off_b] = b.size(i); + + xa_nn_elm_sub_broadcast_4D_f32xf32_f32( + out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape); + } else { + xa_nn_elm_sub_f32xf32_f32(out_data, a_data, b_data, out.numel()); + } + + return out; + } + + ET_SWITCH_REALH_TYPES(a_type, ctx, name, CTYPE_A, [&]() { + ET_SWITCH_REALH_TYPES(b_type, ctx, name, CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); + CTYPE_IN alpha_val; + torch::executor::native::utils::extract_scalar(alpha, &alpha_val); + ET_SWITCH_REALH_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { + SubInner< + can_cast::value, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, alpha_val, out); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence \ No newline at end of file diff --git a/backends/cadence/hifi/operators/op_tanh.cpp b/backends/cadence/hifi/operators/op_tanh.cpp new file mode 100644 index 0000000000..13578beb88 --- /dev/null +++ b/backends/cadence/hifi/operators/op_tanh.cpp @@ -0,0 +1,44 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include + +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::aten::RuntimeContext; +using torch::executor::Error; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { + +Tensor& tanh_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { + bool optimized = 1; + if ((in.scalar_type() != ScalarType::Float) || + (out.scalar_type() != ScalarType::Float)) + optimized = 0; + + if (optimized) { + float* data_in = in.mutable_data_ptr(); + float* data_out = out.mutable_data_ptr(); + xa_nn_vec_tanh_f32_f32(data_out, data_in, (int)in.numel()); + return out; + } + + return torch::executor::native::internal:: + unary_ufunc_realhbbf16_to_floathbf16(std::tanh, ctx, in, out); +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence \ No newline at end of file diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c new file mode 100644 index 0000000000..9eab22b05b --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c @@ -0,0 +1,428 @@ +/******************************************************************************* +* Copyright (c) 2018-2024 Cadence Design Systems, Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +******************************************************************************/ +#include "xa_type_def.h" +#include "xa_nnlib_common_fpu.h" +#include "xa_nn_common.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_kernels_api.h" + + +#if HAVE_VFPU +static void internal_elm_add_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 out_lc, + WORD32 in_lc, + xtbool sign_flag) +{ + int i, j; + + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + int num_simd2_ops; + int num_scalar_ops; + + if(out_lc) + { + num_simd2_ops = in_lc >> 1; + num_scalar_ops = in_lc & 1; + } + else + { + num_simd2_ops = (in_lc >> 2) << 1; + num_scalar_ops = in_lc & 3; + } + + xtfloatx2 x1, x2, y; + xtfloat a0, b0, c0; + + /* For computing inp2 + inp1 */ + if(sign_flag){ + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; + p_b = (xtfloatx2 *)p_inp2; + p_c = (xtfloatx2 *)&p_out[i * in_lc]; + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + y = XT_ADD_SX2(x2, x1); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } + } + else + { + ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + y = XT_ADD_SX2(x2, x1); + XT_SASX2IP(y, out_a, p_c); + } + XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_ADD_S(b0, a0); + XT_SSI(c0, (xtfloat *)p_c, 0); + } + } + } + /* For computing inp1 + inp2 */ + else + { + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; + p_b = (xtfloatx2 *)p_inp2; + p_c = (xtfloatx2 *)&p_out[i * in_lc]; + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + y = XT_ADD_SX2(x1, x2); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } + } + else + { + ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + y = XT_ADD_SX2(x1, x2); + XT_SASX2IP(y, out_a, p_c); + } + XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_ADD_S(a0, b0); + XT_SSI(c0, (xtfloat *)p_c, 0); + } + } + } +} + +static void internal_elm_add_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm, + xtbool sign_flag) +{ + int i; + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + const int num_simd2_ops = num_elm >> 1; + const int num_scalar_ops = num_elm & 1; + + xtfloat a0_7, out; + xtfloatx2 x1, x2, y; + x2 = XT_LSI((xtfloat *)p_b, 0); + + /* For computing inp2 + inp1 */ + if(sign_flag){ + if(((((unsigned)p_a)&7) == 0) && 
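Portable sketch of what the internal 2D broadcast helpers above compute: out_lc rows of length in_lc, where the first input supplies a new row each iteration and the second input is a single row reused (broadcast) across all of them. swap_operands plays the role of sign_flag: for add it only changes operand order, but the same skeleton serves sub/div where order matters. Names here are illustrative.

    #include <cstdio>
    #include <vector>

    void add_broadcast_2d(float* out, const float* a, const float* b,
                          int out_lc, int in_lc, bool swap_operands) {
      for (int row = 0; row < out_lc; row++) {
        const float* a_row = a + row * in_lc;  // new row of input 1 per pass
        float* out_row = out + row * in_lc;
        for (int j = 0; j < in_lc; j++) {
          // input 2 is the broadcast row, reused for every output row
          out_row[j] = swap_operands ? (b[j] + a_row[j]) : (a_row[j] + b[j]);
        }
      }
    }

    int main() {
      std::vector<float> a = {1, 2, 3, 4, 5, 6};  // 2 x 3
      std::vector<float> b = {10, 20, 30};        // 1 x 3, broadcast over rows
      std::vector<float> out(6);
      add_broadcast_2d(out.data(), a.data(), b.data(),
                       /*out_lc=*/2, /*in_lc=*/3, /*swap_operands=*/false);
      for (float v : out) std::printf("%g ", v);  // 11 22 33 14 25 36
      std::printf("\n");
      return 0;
    }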
((((unsigned)p_c)&7) == 0)) + { + for(i=0; i p_inp2_shape[i] ? p_inp1_shape[i] : p_inp2_shape[i]))) + { + return -1; + } + } + + WORD32 inp1_strides[4], inp2_strides[4]; + inp1_strides[3] = 1; + inp2_strides[3] = 1; + for(i = 2; i >= 0; i--) + { + ae_int32x2 d_str, d_shape; + d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); + d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); + d_str = AE_MULP32X2(d_str, d_shape); + inp1_strides[i] = AE_MOVAD32_H(d_str); + inp2_strides[i] = AE_MOVAD32_L(d_str); + } + + int need_broadcast = 0; + int inp1_const = 1, inp2_const = 1; + for(i = 0; i < 4; i++) + { + if(p_inp1_shape[i] != p_inp2_shape[i]) + { + if(p_inp1_shape[i] == 1) + inp1_strides[i] = 0; + else + inp2_strides[i] = 0; + + need_broadcast = 1; + } + if(p_inp1_shape[i] != 1) + inp1_const &= 0; + if(p_inp2_shape[i] != 1) + inp2_const &= 0; + } + int itr0, itr1, itr2; + + FLOAT32 *p_out_tmp = p_out; + const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; + const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; + if(need_broadcast == 0) + { + sign_flag = 0; + internal_elm_add_broadcast_2D_f32xf32_f32( + p_out, + p_inp1, + p_inp2, + 1, + p_out_shape[0] * inp1_strides[0], + sign_flag); + } + else if(inp1_strides[3] == inp2_strides[3]) + { + WORD32 in_lc, out_lc; + sign_flag = 0; + in_lc = p_out_shape[2] * p_out_shape[3]; + out_lc = 1; + if(inp1_strides[2] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[2]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + else if(inp2_strides[2] == 0) + { + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + internal_elm_add_broadcast_2D_f32xf32_f32( + p_out_tmp, + p_inp1_tmp0, + p_inp2_tmp0, + out_lc, + in_lc, + sign_flag); + p_out_tmp += in_lc * out_lc; + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + else if(inp1_const == 1 || inp2_const == 1) + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + sign_flag = 1; + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + } + internal_elm_add_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp, + p_inp2_tmp, + p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], + sign_flag); + } + else + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[3]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + tmp_strides[2] = inp1_strides[2]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + inp1_strides[2] = inp2_strides[2]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + inp2_strides[2] = tmp_strides[2]; + } + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + const FLOAT32 *__restrict__ p_inp1_tmp1 = 
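Sketch of the stride planning done by the xa_nn_elm_*_broadcast_4D_* entry points above, in portable form: build row-major strides for each 4D input from its own shape, then zero the stride of any dimension that input broadcasts, so walking the output indices re-reads the broadcast data. The real code additionally detects fully-constant inputs and swaps operands (recording sign_flag) so the inner kernels can assume input 2 is the broadcast one.

    #include <array>
    #include <cstdio>

    using Shape4 = std::array<int, 4>;

    // Assumes in_shape has already been validated against out_shape
    // (every extent is either equal to the output extent or 1).
    Shape4 broadcast_strides(const Shape4& in_shape, const Shape4& out_shape) {
      Shape4 strides{};
      strides[3] = 1;
      for (int i = 2; i >= 0; i--) {
        strides[i] = strides[i + 1] * in_shape[i + 1];  // row-major strides
      }
      for (int i = 0; i < 4; i++) {
        if (in_shape[i] == 1 && out_shape[i] != 1) {
          strides[i] = 0;  // broadcast dimension: do not advance
        }
      }
      return strides;
    }

    int main() {
      Shape4 in = {1, 1, 8, 3}, out = {2, 4, 8, 3};
      Shape4 s = broadcast_strides(in, out);
      std::printf("%d %d %d %d\n", s[0], s[1], s[2], s[3]);  // 0 0 3 1
      return 0;
    }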
p_inp1_tmp0; + const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0; + for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) + { + { + internal_elm_add_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp1, + p_inp2_tmp1, + p_out_shape[3], + sign_flag); + } + p_out_tmp += p_out_shape[3]; + p_inp1_tmp1 += inp1_strides[2]; + p_inp2_tmp1 += inp2_strides[2]; + } + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + return 0; + +} + diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c new file mode 100644 index 0000000000..03b8d62518 --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c @@ -0,0 +1,419 @@ +#include "xa_type_def.h" +#include "xa_nnlib_common_fpu.h" +#include "xa_nn_common.h" +#include "xa_nnlib_err_chk.h" +//#include "xa_nn_basic_state.h" +#include "xa_nnlib_kernels_api.h" + +#if HAVE_VFPU +static void internal_elm_div_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 out_lc, + WORD32 in_lc, + xtbool sign_flag) +{ + int i, j; + + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + int num_simd2_ops; + int num_scalar_ops; + + if(out_lc) + { + num_simd2_ops = in_lc >> 1; + num_scalar_ops = in_lc & 1; + } + else + { + num_simd2_ops = (in_lc >> 2) << 1; + num_scalar_ops = in_lc & 3; + } + + xtfloatx2 x1, x2, y; + xtfloat a0, b0, c0; + + /* For computing inp2 - inp1 */ + if(sign_flag){ + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; + p_b = (xtfloatx2 *)p_inp2; + p_c = (xtfloatx2 *)&p_out[i * in_lc]; + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + y = XT_DIV_SX2(x2, x1); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } + } + else + { + ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + y = XT_DIV_SX2(x2, x1); + XT_SASX2IP(y, out_a, p_c); + } + XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_DIV_S(b0, a0); + XT_SSI(c0, (xtfloat *)p_c, 0); + } + } + } + /* For computing inp1 - inp2 */ + else + { + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; + p_b = (xtfloatx2 *)p_inp2; + p_c = (xtfloatx2 *)&p_out[i * in_lc]; + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + y = XT_DIV_SX2(x1, x2); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } + } + else + { + ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + y = XT_DIV_SX2(x1, x2); + XT_SASX2IP(y, out_a, p_c); + } + XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, 
(xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_DIV_S(a0, b0); + XT_SSI(c0, (xtfloat *)p_c, 0); + } + } + } +} + +static void internal_elm_div_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm, + xtbool sign_flag) +{ + int i; + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + const int num_simd2_ops = num_elm >> 1; + const int num_scalar_ops = num_elm & 1; + + xtfloat a0_7, out; + xtfloatx2 x1, x2, y; + x2 = XT_LSI((xtfloat *)p_b, 0); + + /* For computing inp2 - inp1 */ + if(sign_flag){ + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i p_inp2_shape[i] ? p_inp1_shape[i] : p_inp2_shape[i]))) + { + return -1; + } + } + + WORD32 inp1_strides[4], inp2_strides[4]; + inp1_strides[3] = 1; + inp2_strides[3] = 1; + for(i = 2; i >= 0; i--) + { + ae_int32x2 d_str, d_shape; + d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); + d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); + d_str = AE_MULP32X2(d_str, d_shape); + inp1_strides[i] = AE_MOVAD32_H(d_str); + inp2_strides[i] = AE_MOVAD32_L(d_str); + } + + int need_broadcast = 0; + int inp1_const = 1, inp2_const = 1; + for(i = 0; i < 4; i++) + { + if(p_inp1_shape[i] != p_inp2_shape[i]) + { + if(p_inp1_shape[i] == 1) + inp1_strides[i] = 0; + else + inp2_strides[i] = 0; + + need_broadcast = 1; + } + if(p_inp1_shape[i] != 1) + inp1_const &= 0; + if(p_inp2_shape[i] != 1) + inp2_const &= 0; + } + int itr0, itr1, itr2; + + FLOAT32 *p_out_tmp = p_out; + const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; + const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; + if(need_broadcast == 0) + { + sign_flag = 0; + internal_elm_div_broadcast_2D_f32xf32_f32( + p_out, + p_inp1, + p_inp2, + 1, + p_out_shape[0] * inp1_strides[0], + sign_flag); + } + else if(inp1_strides[3] == inp2_strides[3]) + { + WORD32 in_lc, out_lc; + sign_flag = 0; + in_lc = p_out_shape[2] * p_out_shape[3]; + out_lc = 1; + if(inp1_strides[2] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[2]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + else if(inp2_strides[2] == 0) + { + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + internal_elm_div_broadcast_2D_f32xf32_f32( + p_out_tmp, + p_inp1_tmp0, + p_inp2_tmp0, + out_lc, + in_lc, + sign_flag); + p_out_tmp += in_lc * out_lc; + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + else if(inp1_const == 1 || inp2_const == 1) + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + sign_flag = 1; + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + } + internal_elm_div_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp, + p_inp2_tmp, + p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], + 
sign_flag); + } + else + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[3]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + tmp_strides[2] = inp1_strides[2]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + inp1_strides[2] = inp2_strides[2]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + inp2_strides[2] = tmp_strides[2]; + } + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; + const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0; + for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) + { + { + internal_elm_div_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp1, + p_inp2_tmp1, + p_out_shape[3], + sign_flag); + } + p_out_tmp += p_out_shape[3]; + p_inp1_tmp1 += inp1_strides[2]; + p_inp2_tmp1 += inp2_strides[2]; + } + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + return 0; +} +#endif diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c new file mode 100644 index 0000000000..95b449f43f --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c @@ -0,0 +1,644 @@ +#include "xa_type_def.h" +#include "xa_nnlib_common_fpu.h" +#include "xa_nn_common.h" +#include "xa_nnlib_err_chk.h" +//#include "xa_nn_basic_state.h" +#include "xa_nnlib_kernels_api.h" + +#if !HAVE_VFPU + DISCARD_FUN_FOR_NONVOID_RETURN( + WORD32, xa_nn_elm_div_mode_f32xf32_f32, + ( + FLOAT32 *p_out, + const FLOAT32 *p_inp1, + const FLOAT32 *p_inp2, + WORD32 num_elm, + WORD32 mode + ) + ) +#else +WORD32 xa_nn_elm_div_mode_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm, + WORD32 mode) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp1, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp2, -1); + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1); + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND((num_elm <= 0), -1); + XA_NNLIB_ARG_CHK_COND(((mode != 0) && (mode != 1)), -1); + + int i; + xtfloatx2 *inp1 = (xtfloatx2 *)p_inp1; + xtfloatx2 *inp2 = (xtfloatx2 *)p_inp2; + xtfloatx2 *out = (xtfloatx2 *)p_out; + xtfloatx2 x1, x2, y; + ae_valign inp1_a, inp2_a, out_a; + + inp1_a = XT_LASX2PP(inp1); + inp2_a = XT_LASX2PP(inp2); + out_a = AE_ZALIGN64(); + /* Each iteration of loop is independent so safe to use concurrent pragma */ + if(mode == 0) + { +#pragma concurrent /* Each iteration of loop is independent so safe to use concurrent pragma */ + for(i=0;i < num_elm>>1;i++) + { + XT_LASX2IP(x1, inp1_a, inp1); + XT_LASX2IP(x2, inp2_a, inp2); + y = XT_DIV_SX2(x1, x2); + y = FITRUNC_SX2(y); + XT_SASX2IP(y, out_a, out); + } + } + else + { +#pragma concurrent + for(i=0;i < num_elm>>1;i++) + { + XT_LASX2IP(x1, inp1_a, inp1); + XT_LASX2IP(x2, inp2_a, inp2); + y = XT_DIV_SX2(x1, x2); + y = FIFLOOR_SX2(y); + XT_SASX2IP(y, out_a, out); + } + 
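Portable equivalent of the mode handling in xa_nn_elm_div_mode_f32xf32_f32 above: divide, then truncate (mode 0) or floor (mode 1) each quotient. On HiFi this is the vectorized XT_DIV_SX2 followed by FITRUNC_SX2 or FIFLOOR_SX2; the sketch below is scalar and illustrative only.

    #include <cassert>
    #include <cmath>

    void div_mode_f32(float* out, const float* a, const float* b,
                      int n, int mode) {
      for (int i = 0; i < n; i++) {
        const float q = a[i] / b[i];
        out[i] = (mode == 0) ? std::trunc(q) : std::floor(q);
      }
    }

    int main() {
      const float a[2] = {-7.f, 7.f}, b[2] = {2.f, 2.f};
      float out[2];
      div_mode_f32(out, a, b, 2, /*mode=*/0);   // trunc
      assert(out[0] == -3.f && out[1] == 3.f);
      div_mode_f32(out, a, b, 2, /*mode=*/1);   // floor
      assert(out[0] == -4.f && out[1] == 3.f);
      return 0;
    }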
} + XT_SASX2POSFP(out_a, out); + + // Remainder Loop + if (num_elm & 1) + { + xtfloat a1, a2, a; + XT_LSIP(a1, (xtfloat *)inp1, 0); + XT_LSIP(a2, (xtfloat *)inp2, 0); + a = XT_DIV_S(a1, a2); + if(mode == 0) + a = FITRUNC_S(a); + else + a = FIFLOOR_S(a); + XT_SSI(a, (xtfloat *)out, 0); + } + + return 0; +} +#endif + +#if HAVE_VFPU +static void internal_elm_div_mode_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 out_lc, + WORD32 in_lc, + xtbool sign_flag, + WORD32 mode) +{ + int i, j; + + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + int num_simd2_ops; + int num_scalar_ops; + + if(out_lc) + { + num_simd2_ops = in_lc >> 1; + num_scalar_ops = in_lc & 1; + } + else + { + num_simd2_ops = (in_lc >> 2) << 1; + num_scalar_ops = in_lc & 3; + } + + xtfloatx2 x1, x2, y; + xtfloat a0, b0, c0; + + /* For computing inp2 - inp1 */ + if(sign_flag){ + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; + p_b = (xtfloatx2 *)p_inp2; + p_c = (xtfloatx2 *)&p_out[i * in_lc]; + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + if(mode == 0) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + y = XT_DIV_SX2(x2, x1); + y = FITRUNC_SX2(y); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } + } + else + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + y = XT_DIV_SX2(x2, x1); + y = FIFLOOR_SX2(y); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } + } + } + else + { + ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + if(mode == 0) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + y = XT_DIV_SX2(x2, x1); + y = FITRUNC_SX2(y); + XT_SASX2IP(y, out_a, p_c); + } + } + else + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + y = XT_DIV_SX2(x2, x1); + y = FIFLOOR_SX2(y); + XT_SASX2IP(y, out_a, p_c); + } + } + XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_DIV_S(b0, a0); + if(mode == 0) + c0 = FITRUNC_S(c0); + else + c0 = FIFLOOR_S(c0); + XT_SSI(c0, (xtfloat *)p_c, 0); + } + } + } + /* For computing inp1 - inp2 */ + else + { + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; + p_b = (xtfloatx2 *)p_inp2; + p_c = (xtfloatx2 *)&p_out[i * in_lc]; + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + if(mode == 0) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + y = XT_DIV_SX2(x1, x2); + y = FITRUNC_SX2(y); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } + } + else + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + y = XT_DIV_SX2(x1, x2); + y = FIFLOOR_SX2(y); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } + } + }/* if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0))*/ + else + { + ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); + vinp1 = 
XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + if(mode == 0) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + y = XT_DIV_SX2(x1, x2); + y = FITRUNC_SX2(y); + XT_SASX2IP(y, out_a, p_c); + } + } + else + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + y = XT_DIV_SX2(x1, x2); + y = FIFLOOR_SX2(y); + XT_SASX2IP(y, out_a, p_c); + } + } + XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_DIV_S(a0, b0); + if(mode == 0) + c0 = FITRUNC_S(c0); + else + c0 = FIFLOOR_S(c0); + XT_SSI(c0, (xtfloat *)p_c, 0); + } + } + } +} + +static void internal_elm_div_mode_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm, + xtbool sign_flag, + WORD32 mode) +{ + int i; + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + const int num_simd2_ops = num_elm >> 1; + const int num_scalar_ops = num_elm & 1; + + xtfloat a0_7, out; + xtfloatx2 x1, x2, y; + x2 = XT_LSI((xtfloat *)p_b, 0); + + /* For computing inp2 - inp1 */ + if(sign_flag){ + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + if(mode == 0) + { + for(i=0; i p_inp2_shape[i] ? p_inp1_shape[i] : p_inp2_shape[i]))) + { + return -1; + } + } + XA_NNLIB_ARG_CHK_COND(((mode != 0) && (mode != 1)), -1); + + WORD32 inp1_strides[4], inp2_strides[4]; + inp1_strides[3] = 1; + inp2_strides[3] = 1; + for(i = 2; i >= 0; i--) + { + ae_int32x2 d_str, d_shape; + d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); + d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); + d_str = AE_MULP32X2(d_str, d_shape); + inp1_strides[i] = AE_MOVAD32_H(d_str); + inp2_strides[i] = AE_MOVAD32_L(d_str); + } + + int need_broadcast = 0; + int inp1_const = 1, inp2_const = 1; + for(i = 0; i < 4; i++) + { + if(p_inp1_shape[i] != p_inp2_shape[i]) + { + if(p_inp1_shape[i] == 1) + inp1_strides[i] = 0; + else + inp2_strides[i] = 0; + + need_broadcast = 1; + } + if(p_inp1_shape[i] != 1) + inp1_const &= 0; + if(p_inp2_shape[i] != 1) + inp2_const &= 0; + } + int itr0, itr1, itr2; + + FLOAT32 *p_out_tmp = p_out; + const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; + const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; + if(need_broadcast == 0) + { + sign_flag = 0; + internal_elm_div_mode_broadcast_2D_f32xf32_f32( + p_out, + p_inp1, + p_inp2, + 1, + p_out_shape[0] * inp1_strides[0], + sign_flag, + mode); + } + else if(inp1_strides[3] == inp2_strides[3]) + { + WORD32 in_lc, out_lc; + sign_flag = 0; + in_lc = p_out_shape[2] * p_out_shape[3]; + out_lc = 1; + if(inp1_strides[2] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[2]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + else if(inp2_strides[2] == 0) + { + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + 
for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + internal_elm_div_mode_broadcast_2D_f32xf32_f32( + p_out_tmp, + p_inp1_tmp0, + p_inp2_tmp0, + out_lc, + in_lc, + sign_flag, + mode); + p_out_tmp += in_lc * out_lc; + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + else if(inp1_const == 1 || inp2_const == 1) + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + sign_flag = 1; + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + } + internal_elm_div_mode_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp, + p_inp2_tmp, + p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], + sign_flag, + mode); + } + else + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[3]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + tmp_strides[2] = inp1_strides[2]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + inp1_strides[2] = inp2_strides[2]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + inp2_strides[2] = tmp_strides[2]; + } + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; + const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0; + for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) + { + { + internal_elm_div_mode_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp1, + p_inp2_tmp1, + p_out_shape[3], + sign_flag, + mode); + } + p_out_tmp += p_out_shape[3]; + p_inp1_tmp1 += inp1_strides[2]; + p_inp2_tmp1 += inp2_strides[2]; + } + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + return 0; +} +#endif diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c new file mode 100644 index 0000000000..b9aa102a15 --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c @@ -0,0 +1,360 @@ +/******************************************************************************* +* Copyright (c) 2018-2024 Cadence Design Systems, Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +******************************************************************************/ +#include "xa_type_def.h" +#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_common_fpu.h" +#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nn_common.h" +#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_err_chk.h" +#include "nnlib-hifi4/xa_nnlib/algo/kernels/basic/hifi4/xa_nn_basic_state.h" +#include "nnlib-hifi4/xa_nnlib/include/nnlib/xa_nnlib_kernels_api.h" + +#if HAVE_VFPU +static void internal_elm_mul_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 out_lc, + WORD32 in_lc, + xtbool sign_flag) +{ + int i, j; + + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + int num_simd2_ops; + int num_scalar_ops; + + if(out_lc) + { + num_simd2_ops = in_lc >> 1; + num_scalar_ops = in_lc & 1; + } + else + { + num_simd2_ops = (in_lc >> 2) << 1; + num_scalar_ops = in_lc & 3; + } + + xtfloatx2 x1, x2, y; + xtfloat a0, b0, c0; + + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; + p_b = (xtfloatx2 *)p_inp2; + p_c = (xtfloatx2 *)&p_out[i * in_lc]; + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + y = XT_MUL_SX2(x2, x1); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } + } + else + { + ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + y = XT_MUL_SX2(x2, x1); + XT_SASX2IP(y, out_a, p_c); + } + XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_MUL_S(b0, a0); + XT_SSI(c0, (xtfloat *)p_c, 0); + } + } +} + +static void internal_elm_mul_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm, + xtbool sign_flag) +{ + int i; + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + const int num_simd2_ops = num_elm >> 1; + const int num_scalar_ops = num_elm & 1; + + xtfloat a0_7, out; + xtfloatx2 x1, x2, y; + x2 = XT_LSI((xtfloat *)p_b, 0); + + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i p_inp2_shape[i] ? 
p_inp1_shape[i] : p_inp2_shape[i]))) + { + return -1; + } + } + + WORD32 inp1_strides[4], inp2_strides[4]; + inp1_strides[3] = 1; + inp2_strides[3] = 1; + for(i = 2; i >= 0; i--) + { + ae_int32x2 d_str, d_shape; + d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); + d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); + d_str = AE_MULP32X2(d_str, d_shape); + inp1_strides[i] = AE_MOVAD32_H(d_str); + inp2_strides[i] = AE_MOVAD32_L(d_str); + } + + int need_broadcast = 0; + int inp1_const = 1, inp2_const = 1; + for(i = 0; i < 4; i++) + { + if(p_inp1_shape[i] != p_inp2_shape[i]) + { + if(p_inp1_shape[i] == 1) + inp1_strides[i] = 0; + else + inp2_strides[i] = 0; + + need_broadcast = 1; + } + if(p_inp1_shape[i] != 1) + inp1_const &= 0; + if(p_inp2_shape[i] != 1) + inp2_const &= 0; + } + int itr0, itr1, itr2; + + FLOAT32 *p_out_tmp = p_out; + const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; + const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; + if(need_broadcast == 0) + { + sign_flag = 0; + internal_elm_mul_broadcast_2D_f32xf32_f32( + p_out, + p_inp1, + p_inp2, + 1, + p_out_shape[0] * inp1_strides[0], + sign_flag); + } + else if(inp1_strides[3] == inp2_strides[3]) + { + WORD32 in_lc, out_lc; + sign_flag = 0; + in_lc = p_out_shape[2] * p_out_shape[3]; + out_lc = 1; + if(inp1_strides[2] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[2]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + else if(inp2_strides[2] == 0) + { + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + internal_elm_mul_broadcast_2D_f32xf32_f32( + p_out_tmp, + p_inp1_tmp0, + p_inp2_tmp0, + out_lc, + in_lc, + sign_flag); + p_out_tmp += in_lc * out_lc; + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + else if(inp1_const == 1 || inp2_const == 1) + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + sign_flag = 1; + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + } + internal_elm_mul_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp, + p_inp2_tmp, + p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], + sign_flag); + } + else + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[3]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + tmp_strides[2] = inp1_strides[2]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + inp1_strides[2] = inp2_strides[2]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + inp2_strides[2] = tmp_strides[2]; + } + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; + const FLOAT32 *__restrict__ p_inp2_tmp1 = 
p_inp2_tmp0; + for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) + { + { + internal_elm_mul_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp1, + p_inp2_tmp1, + p_out_shape[3], + sign_flag); + } + p_out_tmp += p_out_shape[3]; + p_inp1_tmp1 += inp1_strides[2]; + p_inp2_tmp1 += inp2_strides[2]; + } + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + return 0; +} +#endif diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index d3d32266d8..8554ab9ed9 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -454,7 +454,6 @@ vTensor::vTensor( sizes_uniform_(), strides_uniform_(), numel_uniform_(), - axis_map_uniform_(), logical_limits_uniform_(), // Construct Tensor storage storage_( @@ -501,7 +500,6 @@ vTensor::vTensor( sizes_uniform_(), strides_uniform_(), numel_uniform_(), - axis_map_uniform_(), logical_limits_uniform_(), // Construct Tensor storage storage_(context, image) { @@ -527,7 +525,6 @@ vTensor::vTensor(vTensor& other) sizes_uniform_(), strides_uniform_(), numel_uniform_(), - axis_map_uniform_(), logical_limits_uniform_(), // Copy Tensor storage storage_(other.storage_) {} @@ -553,7 +550,6 @@ vTensor::vTensor( sizes_uniform_(), strides_uniform_(), numel_uniform_(), - axis_map_uniform_(), logical_limits_uniform_(), // Copy Tensor storage storage_(other.storage_, vkapi::element_size(dtype_) * offset_numel) { @@ -630,14 +626,6 @@ const vkapi::BufferBindInfo vTensor::strides_ubo() { return vkapi::BufferBindInfo(strides_uniform_.buffer()); } -const vkapi::BufferBindInfo vTensor::axis_map_ubo() { - if (!axis_map_uniform_.buffer()) { - axis_map_uniform_ = - ParamsBuffer(storage_.context_, utils::make_ivec4(axis_map_)); - } - return vkapi::BufferBindInfo(axis_map_uniform_.buffer()); -} - const vkapi::BufferBindInfo vTensor::logical_limits_ubo() { if (!logical_limits_uniform_.buffer()) { logical_limits_uniform_ = ParamsBuffer(storage_.context_, logical_limits_); @@ -710,9 +698,6 @@ void vTensor::update_metadata() { if (numel_uniform_.buffer()) { numel_uniform_.update(numel_); } - if (axis_map_uniform_.buffer()) { - axis_map_uniform_.update(utils::make_ivec4(axis_map_)); - } if (logical_limits_uniform_.buffer()) { logical_limits_uniform_.update(logical_limits_); } diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h index bd83e60038..35b74915d2 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ b/backends/vulkan/runtime/api/containers/Tensor.h @@ -308,7 +308,6 @@ class vTensor final { ParamsBuffer sizes_uniform_; ParamsBuffer strides_uniform_; ParamsBuffer numel_uniform_; - ParamsBuffer axis_map_uniform_; ParamsBuffer logical_limits_uniform_; vTensorStorage storage_; @@ -430,6 +429,19 @@ class vTensor final { return axis_map_; } + /* + * Returns a single int32_t that contains the values of the axis map and the + * packed dimension packed into a single int32_t, such that it can be used as + * a specialization constant in a compute shader. This allows for the SPIR-V + * to bytecode compilation to perform compile-time unfolding on the axis map. + * Each element of the axis map and the value of the packed dimension take up + * 4 bits in the packed int32_t. 
+ */ + inline int32_t hashed_layout() const { + return axis_map_.at(0) + (axis_map_.at(1) << 4) + (axis_map_.at(2) << 8) + + (axis_map_.at(3) << 12) + (packed_dim_ << 16); + } + /* * Return true if the tensor's axis map is {0, 1, 2, concat_dim}. This means * that the width dim is mapped to the width axis of the texture, the height @@ -463,12 +475,6 @@ class vTensor final { */ const vkapi::BufferBindInfo strides_ubo(); - /* - * Returns a GPU buffer containing the texture axis mapping for each dimension - * of the tensor, in WHCN dimension order. - */ - const vkapi::BufferBindInfo axis_map_ubo(); - /* * Returns a GPU buffer containing the logical limits of the tensor. See the * comments for logical_limits() for more context. diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index f2d971a56b..cabf4e7a88 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -318,6 +318,10 @@ class ComputeGraph final { return values_.at(idx).toConstTensor().estimate_memory_layout(); } + inline int32_t hashed_layout_of(const ValueRef idx) const { + return values_.at(idx).toConstTensor().hashed_layout(); + } + inline int32_t packed_dim_of(const ValueRef idx) const { return values_.at(idx).toConstTensor().packed_dim(); } @@ -338,10 +342,6 @@ class ComputeGraph final { return values_.at(idx).toTensor().numel_ubo(); } - inline vkapi::BufferBindInfo axis_map_ubo(const ValueRef idx) { - return values_.at(idx).toTensor().axis_map_ubo(); - } - inline bool has_standard_axis_map(const ValueRef idx) { return values_.at(idx).toTensor().has_standard_axis_map(); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.glsl index 3d9bf885df..a4ed494fe6 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.glsl @@ -16,8 +16,6 @@ $if MAT2_IS_TRANSPOSED: $if HAS_BIAS: #define HAS_BIAS -#include "indexing_utils.h" - ${layout_declare_tensor(B, "w", "out_tensor", DTYPE, "texture3d")} ${layout_declare_tensor(B, "r", "mat1_tensor", DTYPE, "texture3d")} ${layout_declare_tensor(B, "r", "mat2_tensor", DTYPE, "texture3d")} @@ -25,22 +23,32 @@ $if HAS_BIAS: ${layout_declare_tensor(B, "r", "bias_tensor", DTYPE, "texture3d")} ${layout_declare_ubo(B, "ivec4", "out_sizes")} ${layout_declare_ubo(B, "ivec3", "out_limits")} -${layout_declare_ubo(B, "ivec4", "out_axis_map")} ${layout_declare_ubo(B, "ivec4", "mat1_sizes")} -${layout_declare_ubo(B, "ivec4", "mat1_axis_map")} ${layout_declare_ubo(B, "ivec4", "mat2_sizes")} -${layout_declare_ubo(B, "ivec4", "mat2_axis_map")} $if HAS_BIAS: ${layout_declare_ubo(B, "ivec4", "bias_sizes")} - ${layout_declare_ubo(B, "ivec4", "bias_axis_map")} ${layout_declare_ubo(B, "float", "alpha", "float", "beta")} +#include "indexing_utils.h" + layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; -layout(constant_id = 3) const int out_packed_dim = C_DIM; -layout(constant_id = 4) const int mat1_packed_dim = W_DIM; -layout(constant_id = 5) const int mat2_packed_dim = H_DIM; -layout(constant_id = 6) const int bias_packed_dim = W_DIM; +${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} +const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); +const lowp int out_packed_dim = unhash_packed_dim(out_layout); + +${layout_declare_spec_const(C, "int", "mat1_layout", "DEFAULT_LAYOUT")} +const lowp 
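Sketch of the layout hashing introduced above: the four axis-map entries plus the packed dim are packed 4 bits apiece into one int32_t (hashed_layout()), so the whole texture layout can be passed to a shader as a single specialization constant. The unpack side below mirrors what the GLSL unhash_axis_map()/unhash_packed_dim() helpers presumably do; this is a host-side illustration, not the shader code.

    #include <array>
    #include <cassert>
    #include <cstdint>

    int32_t hash_layout(const std::array<int32_t, 4>& axis_map,
                        int32_t packed_dim) {
      return axis_map[0] | (axis_map[1] << 4) | (axis_map[2] << 8) |
             (axis_map[3] << 12) | (packed_dim << 16);
    }

    std::array<int32_t, 4> unhash_axis_map(int32_t h) {
      return {h & 0xF, (h >> 4) & 0xF, (h >> 8) & 0xF, (h >> 12) & 0xF};
    }

    int32_t unhash_packed_dim(int32_t h) {
      return (h >> 16) & 0xF;
    }

    int main() {
      // Standard axis map {0, 1, 2, concat_dim} with channels packing.
      const std::array<int32_t, 4> axis_map = {0, 1, 2, 2};
      const int32_t h = hash_layout(axis_map, /*packed_dim=*/2);
      assert(unhash_axis_map(h) == axis_map);
      assert(unhash_packed_dim(h) == 2);
      return 0;
    }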
ivec4 mat1_axis_map = unhash_axis_map(mat1_layout); +const lowp int mat1_packed_dim = unhash_packed_dim(mat1_layout); + +${layout_declare_spec_const(C, "int", "mat2_layout", "DEFAULT_LAYOUT")} +const lowp ivec4 mat2_axis_map = unhash_axis_map(mat2_layout); +const lowp int mat2_packed_dim = unhash_packed_dim(mat2_layout); + +$if HAS_BIAS: + ${layout_declare_spec_const(C, "int", "bias_layout", "DEFAULT_LAYOUT")} + const lowp ivec4 bias_axis_map = unhash_axis_map(bias_layout); + const lowp int bias_packed_dim = unhash_packed_dim(bias_layout); #ifdef HAS_BIAS vec4 get_bias_texel_W_packed(ivec3 logical_pos) { diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl index ad794d6db4..05c227f302 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl @@ -19,27 +19,35 @@ $if BATCH_MODE: $if HAS_BIAS: #define HAS_BIAS -#include "indexing_utils.h" - ${layout_declare_tensor(B, "w", "out_tensor", DTYPE, "texture3d")} ${layout_declare_tensor(B, "r", "mat1_tensor", DTYPE, "texture3d")} ${layout_declare_tensor(B, "r", "mat2_tensor", DTYPE, "texture3d")} $if HAS_BIAS: ${layout_declare_tensor(B, "r", "bias_tensor", DTYPE, "texture3d")} ${layout_declare_ubo(B, "ivec4", "out_sizes")} -${layout_declare_ubo(B, "ivec4", "out_axis_map")} ${layout_declare_ubo(B, "ivec4", "mat1_sizes")} -${layout_declare_ubo(B, "ivec4", "mat1_axis_map")} ${layout_declare_ubo(B, "ivec4", "mat2_sizes")} -${layout_declare_ubo(B, "ivec4", "mat2_axis_map")} $if HAS_BIAS: ${layout_declare_ubo(B, "ivec4", "bias_sizes")} - ${layout_declare_ubo(B, "ivec4", "bias_axis_map")} ${layout_declare_ubo(B, "float", "alpha", "float", "beta")} +#include "indexing_utils.h" + layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; -layout(constant_id = 3) const int out_packed_dim = C_DIM; +${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} +const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); +const lowp int out_packed_dim = unhash_packed_dim(out_layout); + +${layout_declare_spec_const(C, "int", "mat1_layout", "DEFAULT_LAYOUT")} +const lowp ivec4 mat1_axis_map = unhash_axis_map(mat1_layout); + +${layout_declare_spec_const(C, "int", "mat2_layout", "DEFAULT_LAYOUT")} +const lowp ivec4 mat2_axis_map = unhash_axis_map(mat2_layout); + +$if HAS_BIAS: + ${layout_declare_spec_const(C, "int", "bias_layout", "DEFAULT_LAYOUT")} + const lowp ivec4 bias_axis_map = unhash_axis_map(bias_layout); // To convince the SPIR-V compiler to unroll the loops optimally, need this // macro diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl index 3103c92aea..be0e1bfa20 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl @@ -14,26 +14,31 @@ #define op(X, Y, A) ${OPERATOR} -#include "broadcasting_utils.h" -#include "indexing_utils.h" - layout(std430) buffer; ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} ${layout_declare_tensor(B, "r", "t_other", DTYPE, STORAGE)} ${layout_declare_ubo(B, "ivec4", "out_sizes")} -${layout_declare_ubo(B, "ivec4", "out_axis_map")} ${layout_declare_ubo(B, "ivec4", "in_sizes")} -${layout_declare_ubo(B, "ivec4", "in_axis_map")} ${layout_declare_ubo(B, "ivec4", "other_sizes")} -${layout_declare_ubo(B, "ivec4", "other_axis_map")} 
${layout_declare_ubo(B, "ivec2", "broadcast_params")} ${layout_declare_ubo(B, "float", "alpha")} +#include "broadcasting_utils.h" +#include "indexing_utils.h" + layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; -layout(constant_id = 3) const int packed_dim = C_DIM; +${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} +const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); +const lowp int packed_dim = unhash_packed_dim(out_layout); + +${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} +const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); + +${layout_declare_spec_const(C, "int", "other_layout", "DEFAULT_LAYOUT")} +const lowp ivec4 other_axis_map = unhash_axis_map(other_layout); void main() { const ivec3 lpos = ivec3(gl_GlobalInvocationID); diff --git a/backends/vulkan/runtime/graph/ops/glsl/bitw8_image_to_nchw_nobitw8buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/bitw8_image_to_nchw_nobitw8buffer.glsl index 4fd6e2f14a..34e80b6ec1 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/bitw8_image_to_nchw_nobitw8buffer.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/bitw8_image_to_nchw_nobitw8buffer.glsl @@ -21,12 +21,13 @@ layout(std430) buffer; ${layout_declare_buffer(B, "w", "nchw_out", "int")} ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} ${layout_declare_ubo(B, "ivec4", "tensor_sizes")} -${layout_declare_ubo(B, "ivec4", "axis_map")} ${layout_declare_ubo(B, "int", "out_numel")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; -layout(constant_id = 3) const int packed_dim = C_DIM; +${layout_declare_spec_const(C, "int", "t_layout", "DEFAULT_LAYOUT")} +const lowp ivec4 axis_map = unhash_axis_map(t_layout); +const lowp int packed_dim = unhash_packed_dim(t_layout); void main() { const int out_buf_idx = int(gl_GlobalInvocationID.x); diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv1d.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv1d.glsl index fe6d7ba7a9..e4880d8a22 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv1d.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv1d.glsl @@ -14,8 +14,6 @@ #define op(X, A, B) ${OPERATOR} -#include "indexing_utils.h" - layout(std430) buffer; ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} @@ -26,17 +24,26 @@ ${layout_declare_tensor(B, "r", "bias_in", DTYPE, STORAGE)} ${layout_declare_ubo(B, "ivec3", "out_limits")} ${layout_declare_ubo(B, "ivec4", "in_sizes")} -${layout_declare_ubo(B, "ivec4", "out_axis_map")} -${layout_declare_ubo(B, "ivec4", "in_axis_map")} -${layout_declare_ubo(B, "ivec4", "kernel_axis_map")} -${layout_declare_ubo(B, "ivec4", "bias_axis_map")} - ${layout_declare_ubo(B,"int", "kernel_size", "int", "stride", "int", "padding", "int", "dilation", "int", "in_group_size", "int", "out_group_size")} ${layout_declare_ubo(B, "float", "out_min", "float", "out_max")} +#include "indexing_utils.h" + layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; +${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} +const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); + +${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} +const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); + +${layout_declare_spec_const(C, "int", "kernel_layout", "DEFAULT_LAYOUT")} +const lowp ivec4 kernel_axis_map = unhash_axis_map(kernel_layout); + +${layout_declare_spec_const(C, "int", "bias_layout", "DEFAULT_LAYOUT")} +const lowp ivec4 bias_axis_map = unhash_axis_map(bias_layout); 
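The shader hunks above replace the per-tensor `*_axis_map` UBOs with a single `*_layout` specialization constant that the shader decodes with `unhash_axis_map` / `unhash_packed_dim`. For readers following the bit layout, here is a minimal standalone C++ sketch of the 4-bit packing performed by `vTensor::hashed_layout()` and the matching unpack; the free functions and the round-trip check in `main` are illustrative, not ExecuTorch APIs.

```cpp
// Standalone sketch of the 4-bit "hashed layout" packing used by
// vTensor::hashed_layout() and the unhash_* shader macros in this patch.
#include <array>
#include <cassert>
#include <cstdint>

// Pack a WHCN axis map plus the packed dim into one int32_t, 4 bits per value.
int32_t hash_layout(const std::array<int32_t, 4>& axis_map, int32_t packed_dim) {
  return axis_map[0] | (axis_map[1] << 4) | (axis_map[2] << 8) |
      (axis_map[3] << 12) | (packed_dim << 16);
}

// Recover the axis map from the low 16 bits, one nibble per element.
std::array<int32_t, 4> unhash_axis_map(int32_t h) {
  return {h & 0xf, (h >> 4) & 0xf, (h >> 8) & 0xf, (h >> 12) & 0xf};
}

// Recover the packed dim from bits 16..19.
int32_t unhash_packed_dim(int32_t h) {
  return (h >> 16) & 0xf;
}

int main() {
  const std::array<int32_t, 4> axis_map = {0, 1, 2, 2};  // standard WHCN mapping
  const int32_t packed_dim = 2;                          // channels-packed
  const int32_t h = hash_layout(axis_map, packed_dim);   // 0x22210
  assert(unhash_axis_map(h) == axis_map);
  assert(unhash_packed_dim(h) == packed_dim);
  return 0;
}
```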
+ // Let us define // // input = (N, in_C, in_L), diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl index f02049dc2a..862ccdad30 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl @@ -14,17 +14,14 @@ layout(std430) buffer; -#include "indexing_utils.h" +${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "existing_out", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "existing_out", DTYPE, STORAGE)} -${layout_declare_tensor(2, "r", "t_in", DTYPE, STORAGE)} +${layout_declare_ubo(B, "ivec4", "out_sizes")} +${layout_declare_ubo(B, "ivec4", "in_sizes")} -${layout_declare_ubo(3, "ivec4", "out_sizes")} -${layout_declare_ubo(4, "ivec4", "out_axis_map")} -${layout_declare_ubo(5, "ivec4", "in_sizes")} -${layout_declare_ubo(6, "ivec4", "in_axis_map")} -layout(set = 0, binding = 7) uniform PRECISION restrict CopyArgs { +layout(set = 0, binding = 5) uniform PRECISION restrict CopyArgs { // Operates on (x, y, z) logical extents. ivec3 range; // Analogus to range variable in copy. It defines the # of channel being @@ -35,9 +32,16 @@ layout(set = 0, binding = 7) uniform PRECISION restrict CopyArgs { int src_channel_offset; }; +#include "indexing_utils.h" + layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; -layout(constant_id = 3) const int packed_dim = C_DIM; +${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} +const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); +const lowp int packed_dim = unhash_packed_dim(out_layout); + +${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} +const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); void main() { // Note: Unlike other shaders, the range is often not equal to the destination diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl index 7781fcb265..3dbc59e041 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl @@ -12,19 +12,23 @@ ${define_active_storage_type(STORAGE)} -#include "indexing_utils.h" - layout(std430) buffer; ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} ${layout_declare_ubo(B, "ivec3", "range", "ivec3", "src_offset", "ivec3", "dst_offset")} -${layout_declare_ubo(B, "ivec4", "out_axis_map")} -${layout_declare_ubo(B, "ivec4", "in_axis_map")} + +#include "indexing_utils.h" layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; +${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} +const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); + +${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} +const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); + void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); diff --git a/backends/vulkan/runtime/graph/ops/glsl/embedding.glsl b/backends/vulkan/runtime/graph/ops/glsl/embedding.glsl index 0a3eeee257..5c3de75634 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/embedding.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/embedding.glsl @@ -14,19 +14,24 @@ layout(std430) buffer; -#include "indexing_utils.h" - 
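Because these values now arrive as specialization constants rather than UBO contents, they are fixed when the compute pipeline is created and the SPIR-V compiler can fold them. As a reminder of what that looks like at the raw Vulkan level (a sketch only; the ExecuTorch runtime wraps this in its own `vkapi::SpecVarList` machinery, and the `constantID` value here is illustrative), a single `int32_t` layout constant would be supplied roughly like this:

```cpp
// Raw-Vulkan sketch of supplying one int32_t specialization constant at
// pipeline creation time (not the ExecuTorch wrapper code).
#include <cstdint>
#include <vulkan/vulkan.h>

VkSpecializationInfo make_layout_spec_info(const int32_t* hashed_layout) {
  // Must match the shader's `layout(constant_id = N)` declaration.
  static const VkSpecializationMapEntry entry = {
      /*constantID=*/3, /*offset=*/0, /*size=*/sizeof(int32_t)};
  VkSpecializationInfo info{};
  info.mapEntryCount = 1;
  info.pMapEntries = &entry;
  info.dataSize = sizeof(int32_t);
  info.pData = hashed_layout;  // caller-owned hashed layout value
  // The returned struct is assigned to
  // VkPipelineShaderStageCreateInfo::pSpecializationInfo before
  // vkCreateComputePipelines().
  return info;
}
```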
${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} ${layout_declare_tensor(B, "r", "t_in", "int", STORAGE)} ${layout_declare_tensor(B, "r", "t_weight", DTYPE, STORAGE)} ${layout_declare_ubo(B, "ivec4", "sizes")} -${layout_declare_ubo(B, "ivec4", "out_axis_map")} -${layout_declare_ubo(B, "ivec4", "in_axis_map")} -${layout_declare_ubo(B, "ivec4", "weight_axis_map")} + +#include "indexing_utils.h" layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; -layout(constant_id = 3) const int packed_dim = C_DIM; +${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} +const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); +const lowp int packed_dim = unhash_packed_dim(out_layout); + +${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} +const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); + +${layout_declare_spec_const(C, "int", "weight_layout", "DEFAULT_LAYOUT")} +const lowp ivec4 weight_axis_map = unhash_axis_map(weight_layout); void main() { const ivec3 out_lpos = ivec3(gl_GlobalInvocationID); diff --git a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl index be3901799f..f7d2770faf 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl @@ -15,8 +15,6 @@ ${define_active_storage_type(STORAGE)} -#include "indexing_utils.h" - ${define_required_extensions(DTYPE)} layout(std430) buffer; @@ -24,11 +22,14 @@ layout(std430) buffer; ${layout_declare_buffer(B, "w", "nchw_out", DTYPE)} ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} ${layout_declare_ubo(B, "ivec4", "sizes")} -${layout_declare_ubo(B, "ivec4", "axis_map")} + +#include "indexing_utils.h" layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; -layout(constant_id = 3) const int packed_dim = C_DIM; +${layout_declare_spec_const(C, "int", "t_layout", "DEFAULT_LAYOUT")} +const lowp ivec4 axis_map = unhash_axis_map(t_layout); +const lowp int packed_dim = unhash_packed_dim(t_layout); void write_out_texel(VEC4_T texel, ivec4 tensor_idx) { const ivec4 buf_indices = tidx_to_nchwi( diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h index 26342bcd2b..09f53fe779 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h +++ b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h @@ -232,12 +232,20 @@ ivec3 lpos_to_pos(const ivec3 lpos, const ivec4 axis_map) { imageStore(im, lpos_to_pos(lpos, axis_map), texel) #endif -// Converts hashed axis mapping and packed dim to a ivec4 -// e.g. 0x000102, 2 -> ivec4(0, 1, 2, 2) -// e.g. 0x010200, 1 -> ivec4(1, 2, 0, 1) -#define UNHASH_AXIS_MAP(hash, packed_dim) \ - ivec4(hash >> 16, (hash >> 8) & 0xFF, hash & 0xFF, packed_dim) -#define DEFAULT_AXIS_MAP_HASH 0x000102 +/* + * Converts hashed layout to a ivec4 containing the axis map data and an int + * containing the packed dim respectively. Each value takes up 4 bits in the + * packed int, and values are read from least significant half byte (right-most) + * to most significant half byte (left-most). + * e.g. 0x20122, 2 -> ivec4(0, 1, 2, 2) + * e.g. 
0x11021, 1 -> ivec4(1, 2, 0, 1) + */ +#define unhash_axis_map(hash) \ + ivec4(hash & 0xf, (hash >> 4) & 0xf, (hash >> 8 & 0xf), (hash >> 12 & 0xf)) + +#define unhash_packed_dim(hash) int(hash >> 16 & 0xf) + +#define DEFAULT_LAYOUT 0x02210 /************************ * Deprecated Functions * diff --git a/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl b/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl index 03500b2d08..f984821600 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl @@ -31,13 +31,13 @@ ${layout_declare_ubo(B, "float", "epsilon")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; -${layout_declare_spec_const(C, "int", "in_axis_map_hash", "DEFAULT_AXIS_MAP_HASH")} -${layout_declare_spec_const(C, "int", "in_packed_dim", "C_DIM")} -const ivec4 in_axis_map = UNHASH_AXIS_MAP(in_axis_map_hash, in_packed_dim); +${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} +const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); +const lowp int in_packed_dim = unhash_packed_dim(in_layout); -${layout_declare_spec_const(C, "int", "out_axis_map_hash", "DEFAULT_AXIS_MAP_HASH")} -${layout_declare_spec_const(C, "int", "out_packed_dim", "C_DIM")} -const ivec4 out_axis_map = UNHASH_AXIS_MAP(out_axis_map_hash, out_packed_dim); +${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} +const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); +const lowp int out_packed_dim = unhash_packed_dim(out_layout); void main() { const ivec3 lpos = ivec3(gl_GlobalInvocationID); diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl index 8a3ef68528..25113887dc 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl @@ -23,11 +23,12 @@ layout(std430) buffer; ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} ${layout_declare_buffer(B, "r", "nchw_in", "int")} ${layout_declare_ubo(B, "ivec4", "sizes")} -${layout_declare_ubo(B, "ivec4", "axis_map")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; -layout(constant_id = 3) const int packed_dim = C_DIM; +${layout_declare_spec_const(C, "int", "t_layout", "DEFAULT_LAYOUT")} +const lowp ivec4 axis_map = unhash_axis_map(t_layout); +const lowp int packed_dim = unhash_packed_dim(t_layout); /* * Extends sign of int8 diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl index ea4e0d300c..bf498f34d5 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl @@ -20,7 +20,7 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; // This constant is unused in this shader but is kept so that the signature is // consistent with nchw_to_image. 
-layout(constant_id = 3) const int UNUSED_packed_dim = W_DIM; +${layout_declare_spec_const(C, "int", "UNUSED_layout", "0")} void main() { int out_bufi = int(gl_GlobalInvocationID.x); diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl index b86a59fc23..bde846289e 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl @@ -15,20 +15,21 @@ ${define_active_storage_type(STORAGE)} -#include "indexing_utils.h" - ${define_required_extensions(DTYPE)} layout(std430) buffer; ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_buffer(B, "r", "nchw_in", DTYPE)} +${layout_declare_buffer(B, "r", "buf_in", DTYPE)} ${layout_declare_ubo(B, "ivec4", "sizes")} -${layout_declare_ubo(B, "ivec4", "axis_map")} + +#include "indexing_utils.h" layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; -layout(constant_id = 3) const int packed_dim = C_DIM; +${layout_declare_spec_const(C, "int", "t_layout", "DEFAULT_LAYOUT")} +const lowp ivec4 axis_map = unhash_axis_map(t_layout); +const lowp int packed_dim = unhash_packed_dim(t_layout); VEC4_T read_texel(ivec4 tidx) { const ivec4 buf_indices = tidx_to_nchwi( @@ -38,16 +39,16 @@ VEC4_T read_texel(ivec4 tidx) { VEC4_T texel = VEC4_T(0); if (tidx[packed_dim] < sizes[packed_dim]) { - texel.x = SCALAR_T(nchw_in[buf_indices.x]); + texel.x = SCALAR_T(buf_in[buf_indices.x]); } if (tidx[packed_dim] + 1 < sizes[packed_dim]) { - texel.y = SCALAR_T(nchw_in[buf_indices.y]); + texel.y = SCALAR_T(buf_in[buf_indices.y]); } if (tidx[packed_dim] + 2 < sizes[packed_dim]) { - texel.z = SCALAR_T(nchw_in[buf_indices.z]); + texel.z = SCALAR_T(buf_in[buf_indices.z]); } if (tidx[packed_dim] + 3 < sizes[packed_dim]) { - texel.w = SCALAR_T(nchw_in[buf_indices.w]); + texel.w = SCALAR_T(buf_in[buf_indices.w]); } return texel; } diff --git a/backends/vulkan/runtime/graph/ops/glsl/repeat_interleave.glsl b/backends/vulkan/runtime/graph/ops/glsl/repeat_interleave.glsl index 3ade1f10cb..1a8e677a38 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/repeat_interleave.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/repeat_interleave.glsl @@ -19,15 +19,19 @@ layout(std430) buffer; ${layout_declare_tensor(B, "w", "tout", DTYPE, STORAGE)} ${layout_declare_tensor(B, "r", "tin", DTYPE, STORAGE)} ${layout_declare_ubo(B, "ivec3", "tin_limits")} -${layout_declare_ubo(B, "ivec4", "tin_axis_map")} -${layout_declare_ubo(B, "ivec4", "tout_axis_map")} + +#include "indexing_utils.h" layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; -layout(constant_id = 3) const int nrepeats = 1; -layout(constant_id = 4) const int repeat_dim = 1; +${layout_declare_spec_const(C, "int", "tout_layout", "DEFAULT_LAYOUT")} +const lowp ivec4 tout_axis_map = unhash_axis_map(tout_layout); -#include "indexing_utils.h" +${layout_declare_spec_const(C, "int", "tin_layout", "DEFAULT_LAYOUT")} +const lowp ivec4 tin_axis_map = unhash_axis_map(tin_layout); + +${layout_declare_spec_const(C, "int", "nrepeats", "1")} +${layout_declare_spec_const(C, "int", "repeat_dim", "1")} void main() { const ivec3 tin_lpos = ivec3(gl_GlobalInvocationID); diff --git a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp index c055431a84..33f73cd6da 100644 --- a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp @@ -84,15 +84,12 @@ void 
add_binary_op_node( {{arg1, arg2}, vkapi::MemoryAccessType::READ}}, // Shader params buffers {t_out->sizes_ubo(), - t_out->axis_map_ubo(), t_in1->sizes_ubo(), - t_in1->axis_map_ubo(), t_in2->sizes_ubo(), - t_in2->axis_map_ubo(), graph.create_params_buffer(broadcast_params), graph.create_params_buffer(alpha_val)}, // Specialization Constants - {SV(t_out->packed_dim())}, + {t_out->hashed_layout(), t_in1->hashed_layout(), t_in2->hashed_layout()}, // Resizing Logic resize_binary_op_node, {})); diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp index 43568622f8..880d48e25e 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp @@ -106,9 +106,9 @@ ValueRef prepack_biases( graph.create_local_wg_size(v), vref, v, - {t->sizes_ubo(), t->axis_map_ubo()}, + {t->sizes_ubo()}, // Specialization constants - {SV(t->packed_dim())})); + {t->hashed_layout()})); return v; } @@ -479,15 +479,14 @@ void add_conv1d_node( { t_out->logical_limits_ubo(), t_in->sizes_ubo(), - t_out->axis_map_ubo(), - t_in->axis_map_ubo(), - t_weight->axis_map_ubo(), - t_bias->axis_map_ubo(), graph.create_params_buffer(kernel_params), graph.create_params_buffer(out_params), }, // Specialization Constants - {}, + {t_out->hashed_layout(), + t_in->hashed_layout(), + t_weight->hashed_layout(), + t_bias->hashed_layout()}, // Resizing Logic resize_conv1d_node, {weight, stride, padding, dilation})); diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp index 18f337cb10..15cfce2a01 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp @@ -52,17 +52,15 @@ void add_copy_offset_node( graph.create_local_wg_size(out), // Inputs and Outputs { - {out, vkapi::MemoryAccessType::WRITE}, - {in, vkapi::MemoryAccessType::READ}, + {out, vkapi::kWrite}, + {in, vkapi::kRead}, }, // Parameter buffers { graph.create_params_buffer(offset_params), - t_out->axis_map_ubo(), - t_in->axis_map_ubo(), }, // Specialization Constants - {})); + {graph.hashed_layout_of(out), graph.hashed_layout_of(in)})); } void add_copy_channel_offset_node( @@ -169,13 +167,11 @@ void add_copy_channel_offset_node( // Parameter buffers { t_out->sizes_ubo(), - t_out->axis_map_ubo(), t_in->sizes_ubo(), - t_in->axis_map_ubo(), graph.create_params_buffer(channel_offset_params), }, // Specialization Constants - {})); + {graph.hashed_layout_of(out), graph.hashed_layout_of(in)})); } } diff --git a/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp b/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp index beaeed59ba..05ebd3d1a6 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp @@ -46,14 +46,13 @@ void add_embedding_node( VK_KERNEL_FROM_STR(kernel_name), graph.create_global_wg_size(out), graph.create_local_wg_size(out), - {{out, vkapi::MemoryAccessType::WRITE}, - {{in, weight}, vkapi::MemoryAccessType::READ}}, + {{out, vkapi::kWrite}, {{in, weight}, vkapi::kRead}}, { t_out->sizes_ubo(), - t_out->axis_map_ubo(), - t_in->axis_map_ubo(), - t_weight->axis_map_ubo(), - })); + }, + {t_out->hashed_layout(), + t_in->hashed_layout(), + t_weight->hashed_layout()})); } void embedding(ComputeGraph& graph, const std::vector& args) { diff --git a/backends/vulkan/runtime/graph/ops/impl/Linear.cpp b/backends/vulkan/runtime/graph/ops/impl/Linear.cpp index 
74afce1abe..e2d6fc2551 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Linear.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Linear.cpp @@ -119,20 +119,16 @@ void add_addmm_naive_node( { graph.sizes_ubo(out), graph.logical_limits_ubo(out), - graph.axis_map_ubo(out), graph.sizes_ubo(mat1), - graph.axis_map_ubo(mat1), graph.sizes_ubo(mat2), - graph.axis_map_ubo(mat2), graph.sizes_ubo(self), - graph.axis_map_ubo(self), graph.create_params_buffer(params), }, // Specialization Constants - {graph.packed_dim_of(out), - graph.packed_dim_of(mat1), - graph.packed_dim_of(mat2), - graph.packed_dim_of(self)}, + {graph.hashed_layout_of(out), + graph.hashed_layout_of(mat1), + graph.hashed_layout_of(mat2), + graph.hashed_layout_of(self)}, // Resizing Logic resize_addmm_node, {mat2_is_transposed})); @@ -215,17 +211,16 @@ void add_addmm_optimized_node( // Shader params buffers { graph.sizes_ubo(out), - graph.axis_map_ubo(out), graph.sizes_ubo(mat1_W_packed), - graph.axis_map_ubo(mat1_W_packed), graph.sizes_ubo(mat2_packed), - graph.axis_map_ubo(mat2_packed), graph.sizes_ubo(self), - graph.axis_map_ubo(self), graph.create_params_buffer(params), }, // Specialization Constants - {graph.packed_dim_of(out)}, + {graph.hashed_layout_of(out), + graph.hashed_layout_of(mat1_W_packed), + graph.hashed_layout_of(mat2_packed), + graph.hashed_layout_of(self)}, // Resizing Logic resize_addmm_node, {mat2_is_transposed})); diff --git a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp index 71e9033cec..8ca9858d88 100644 --- a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp @@ -135,16 +135,13 @@ void add_matmul_naive_texture3d_node( { graph.sizes_ubo(out), graph.logical_limits_ubo(out), - graph.axis_map_ubo(out), graph.sizes_ubo(mat1), - graph.axis_map_ubo(mat1), graph.sizes_ubo(mat2), - graph.axis_map_ubo(mat2), }, // Specialization Constants - {graph.packed_dim_of(out), - graph.packed_dim_of(mat1), - graph.packed_dim_of(mat2)}, + {graph.hashed_layout_of(out), + graph.hashed_layout_of(mat1), + graph.hashed_layout_of(mat2)}, // Resizing Logic resize_matmul_node, {mat2_is_transposed})); @@ -224,14 +221,13 @@ void add_matmul_optimized_node( // Shader params buffers { graph.sizes_ubo(out), - graph.axis_map_ubo(out), graph.sizes_ubo(mat1_W_packed), - graph.axis_map_ubo(mat1_W_packed), graph.sizes_ubo(mat2_packed), - graph.axis_map_ubo(mat2_packed), }, // Specialization Constants - {graph.packed_dim_of(out)}, + {graph.hashed_layout_of(out), + graph.hashed_layout_of(mat1_W_packed), + graph.hashed_layout_of(mat2_packed)}, // Resizing Logic resize_matmul_node, {mat2_is_transposed})); diff --git a/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp b/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp index 1509f35014..b1cc8c8084 100644 --- a/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp @@ -113,10 +113,8 @@ void add_native_layer_norm_node( }, // Specialization Constants { - hash_axis_map(t_input->axis_map()), - t_input->packed_dim(), - hash_axis_map(t_out->axis_map()), - t_out->packed_dim(), + t_input->hashed_layout(), + t_out->hashed_layout(), }, // Resizing Logic resize_native_layer_norm_node, diff --git a/backends/vulkan/runtime/graph/ops/impl/RepeatInterleave.cpp b/backends/vulkan/runtime/graph/ops/impl/RepeatInterleave.cpp index 16c1366456..5e4608a65b 100644 --- a/backends/vulkan/runtime/graph/ops/impl/RepeatInterleave.cpp 
+++ b/backends/vulkan/runtime/graph/ops/impl/RepeatInterleave.cpp @@ -63,11 +63,12 @@ void add_repeat_interleave_node( {{out, vkapi::MemoryAccessType::WRITE}, {in, vkapi::MemoryAccessType::READ}}, // Parameter buffers - {graph.logical_limits_ubo(in), - graph.axis_map_ubo(in), - graph.axis_map_ubo(out)}, + {graph.logical_limits_ubo(in)}, // Specialization Constants - {nrepeats, repeat_dim}, + {graph.hashed_layout_of(out), + graph.hashed_layout_of(in), + nrepeats, + repeat_dim}, // Resizing Logic resize_repeat_interleave_node, {num_repeats, dim})); diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp index 15045ccca2..80a1e706e8 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp @@ -35,7 +35,7 @@ void add_staging_to_tensor_node( graph.strides_ubo(out_tensor), graph.numel_ubo(out_tensor)}); } else { - ubos.append({graph.sizes_ubo(out_tensor), graph.axis_map_ubo(out_tensor)}); + ubos.append({graph.sizes_ubo(out_tensor)}); } graph.execute_nodes().emplace_back(new DispatchNode( @@ -44,12 +44,11 @@ void add_staging_to_tensor_node( graph.create_global_wg_size(out_tensor), graph.create_local_wg_size(out_tensor), // Input and Outputs - {{out_tensor, vkapi::MemoryAccessType::WRITE}, - {in_staging, vkapi::MemoryAccessType::READ}}, + {{out_tensor, vkapi::kWrite}, {in_staging, vkapi::kRead}}, // Parameter Buffers ubos, // Specialization Constants - {SV(graph.packed_dim_of(out_tensor))}, + {graph.hashed_layout_of(out_tensor)}, // Resizing Logic nullptr, {})); @@ -81,7 +80,7 @@ void add_tensor_to_staging_node( graph.strides_ubo(in_tensor), graph.numel_ubo(in_tensor)}); } else { - ubos.append({graph.sizes_ubo(in_tensor), graph.axis_map_ubo(in_tensor)}); + ubos.append({graph.sizes_ubo(in_tensor)}); } // Normally, the image_to_nchw shader is structured so that each thread reads @@ -104,12 +103,11 @@ void add_tensor_to_staging_node( global_wg_size, graph.create_local_wg_size(global_wg_size), // Input and Outputs - {{out_staging, vkapi::MemoryAccessType::WRITE}, - {in_tensor, vkapi::MemoryAccessType::READ}}, + {{out_staging, vkapi::kWrite}, {in_tensor, vkapi::kRead}}, // Parameter Buffers ubos, // Specialization Constants - {SV(graph.packed_dim_of(in_tensor))})); + {graph.hashed_layout_of(in_tensor)})); } void add_prepack_standard_node( @@ -126,7 +124,7 @@ void add_prepack_standard_node( graph.strides_ubo(tensor), graph.numel_ubo(tensor)}); } else { - ubos.append({graph.sizes_ubo(tensor), graph.axis_map_ubo(tensor)}); + ubos.append({graph.sizes_ubo(tensor)}); } graph.prepack_nodes().emplace_back(new PrepackNode( @@ -140,7 +138,7 @@ void add_prepack_standard_node( // Parameter Buffers ubos, // Specialization Constants - {SV(graph.packed_dim_of(tensor))})); + {graph.hashed_layout_of(tensor)})); } ValueRef prepack_standard( diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h index 508cc2538a..c9eeb0efe0 100644 --- a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h +++ b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h @@ -79,18 +79,4 @@ T nchw_dim_to_whcn_dim(const T& nchw_dim, const int64_t ndim) { return ndim - 1 - nchw_dim; } -// -// Tensor axis map utilities -// - -// Converts ivec4 axis map to a single int32_t, to be able to pass it as a -// specialization constant instead of a ubo. 
This allows for the spir-v to -// bytecode compilation to perform compile-time folding on the axis map. -// Only converts the first 3 indices, as the last index is the packed dim, -// which is passed separately. -// Example: ivec4(0, 1, 2, 2) -> 0x000102 -inline int32_t hash_axis_map(const std::vector& axis_map) { - return (axis_map.at(0) << 16) + (axis_map.at(1) << 8) + axis_map.at(2); -} - } // namespace vkcompute diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp index 73e2f049a3..6124f0b71e 100644 --- a/backends/vulkan/test/utils/test_utils.cpp +++ b/backends/vulkan/test/utils/test_utils.cpp @@ -70,7 +70,7 @@ void record_nchw_to_image_op( vkapi::VulkanBuffer& src_buffer, api::vTensor& v_dst) { vkapi::PipelineBarrier pipeline_barrier{}; - vkapi::SpecVarList specialization_constants = {SV(v_dst.packed_dim())}; + vkapi::SpecVarList specialization_constants = {v_dst.hashed_layout()}; context->submit_compute_job( get_nchw_to_tensor_shader( @@ -86,8 +86,7 @@ void record_nchw_to_image_op( vkapi::PipelineStage::COMPUTE, vkapi::MemoryAccessType::WRITE), src_buffer, - v_dst.sizes_ubo(), - v_dst.axis_map_ubo()); + v_dst.sizes_ubo()); } void record_image_to_nchw_op( @@ -95,7 +94,7 @@ void record_image_to_nchw_op( api::vTensor& v_src, vkapi::VulkanBuffer& dst_buffer) { vkapi::PipelineBarrier pipeline_barrier{}; - vkapi::SpecVarList specialization_constants = {SV(v_src.packed_dim())}; + vkapi::SpecVarList specialization_constants = {v_src.hashed_layout()}; context->submit_compute_job( get_tensor_to_nchw_shader(v_src), @@ -107,8 +106,7 @@ void record_image_to_nchw_op( 0, dst_buffer, v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), - v_src.sizes_ubo(), - v_src.axis_map_ubo()); + v_src.sizes_ubo()); } void record_bitw8_image_to_nchw_nobitw8buffer_op( @@ -128,13 +126,12 @@ void record_bitw8_image_to_nchw_nobitw8buffer_op( pipeline_barrier, global_wg_size, adaptive_work_group_size(global_wg_size), - {v_src.packed_dim()}, + {v_src.hashed_layout()}, VK_NULL_HANDLE, 0, dst_buffer.buffer(), v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), v_src.sizes_ubo(), - v_src.axis_map_ubo(), v_src.numel_ubo()); } @@ -337,7 +334,7 @@ void record_matmul_texture3d( pipeline_barrier, global_wg_size, {8, 8, 1}, - {out.packed_dim(), mat1.packed_dim(), mat2.packed_dim()}, + {out.hashed_layout(), mat1.hashed_layout(), mat2.hashed_layout()}, VK_NULL_HANDLE, 0, out.image( @@ -348,11 +345,8 @@ void record_matmul_texture3d( mat2.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), out.sizes_ubo(), out.logical_limits_ubo(), - out.axis_map_ubo(), mat1.sizes_ubo(), - mat1.axis_map_ubo(), - mat2.sizes_ubo(), - mat2.axis_map_ubo()); + mat2.sizes_ubo()); } // diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 87cafd10a7..1d40fe1bb5 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -1588,9 +1588,8 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { /*shared_object_idx = */ 4); // +2: t.sizes_ubo() for each staging shader - // +2: t.axis_map_ubo() for each staging shader // +2: staging buffer for each input tensor - expected_vma_allocation_count += 6; + expected_vma_allocation_count += 4; EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); ValueRef c = graph.add_tensor( @@ -1603,8 +1602,7 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { // +2: alpha UBO, broadcast 
UBO for arithmetic shader // +1: t.sizes_ubo() for arithmetic shader output c - // +1: t.axis_map_ubo() for arithmetic shader output c - expected_vma_allocation_count += 4; + expected_vma_allocation_count += 3; EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); IOValueRef d = graph.add_input_tensor( @@ -1613,9 +1611,8 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { /*shared_object_idx = */ 2); // +1: t.sizes_ubo() uniform buffer for staging shader - // +1: t.axis_map_ubo() uniform buffer for staging shader // +1: staging buffer for the input tensor - expected_vma_allocation_count += 3; + expected_vma_allocation_count += 2; EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); ValueRef e = graph.add_tensor( @@ -1628,8 +1625,7 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { // +2: alpha UBO, broadcast UBO for arithmetic shader // +1: t.sizes_ubo() for arithmetic shader output e - // +1: t.axis_map_ubo() for arithmetic shader output e - expected_vma_allocation_count += 4; + expected_vma_allocation_count += 3; EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); IOValueRef out = {}; diff --git a/build/run_android_emulator.sh b/build/run_android_emulator.sh index aa63e2a86a..62d5ea767f 100755 --- a/build/run_android_emulator.sh +++ b/build/run_android_emulator.sh @@ -18,5 +18,10 @@ $ADB_PATH wait-for-device shell 'while [[ -z $(getprop sys.boot_completed) ]]; d echo "List all running emulators" $ADB_PATH devices -# TODO: Run tests on emulator here, atm the script only boots up the emulator -# and exits without doing anything yet +adb install -t app-debug.apk +adb install -t app-debug-androidTest.apk + +adb shell mkdir -p /data/local/tmp/llama +adb push model.pte /data/local/tmp/llama +adb push tokenizer.bin /data/local/tmp/llama +adb shell am instrument -w -r com.example.executorchllamademo.test/androidx.test.runner.AndroidJUnitRunner diff --git a/docs/source/executorch-arm-delegate-tutorial.md b/docs/source/executorch-arm-delegate-tutorial.md index 59cb28b497..25b5551b5e 100644 --- a/docs/source/executorch-arm-delegate-tutorial.md +++ b/docs/source/executorch-arm-delegate-tutorial.md @@ -13,7 +13,7 @@ :::{grid-item-card} What you will learn in this tutorial: :class-card: card-prerequisites -In this tutorial you will learn how to export a simple PyTorch model for ExecuTorch Arm Ethos-u backend delegate and run it on a Corstone-300 FVP Simulator. +In this tutorial you will learn how to export a simple PyTorch model for ExecuTorch Arm Ethos-u backend delegate and run it on a Corstone FVP Simulators. ::: :::: @@ -34,9 +34,9 @@ Let's make sure you have everything you need before we get started. To successfully complete this tutorial, you will need a Linux-based host machine with Arm aarch64 or x86_64 processor architecture. -The target device will be an embedded platform with an Arm Cortex-M55 CPU and Ethos-U55 NPU (ML processor). This tutorial will show you how to run PyTorch models on both. +The target device will be an embedded platform with an Arm Cortex-M CPUs and Ethos-U NPUs (ML processor). This tutorial will show you how to run PyTorch models on both. -We will be using a [Fixed Virtual Platform (FVP)](https://www.arm.com/products/development-tools/simulation/fixed-virtual-platforms), simulating a [Corstone-300](https://developer.arm.com/Processors/Corstone-300)(cs300) system. 
Since we will be using the FVP (think of it as virtual hardware), we won't be requiring any real embedded hardware for this tutorial. +We will be using a [Fixed Virtual Platform (FVP)](https://www.arm.com/products/development-tools/simulation/fixed-virtual-platforms), simulating [Corstone-300](https://developer.arm.com/Processors/Corstone-300)(cs300) and [Corstone-320](https://developer.arm.com/Processors/Corstone-320)(cs320)systems. Since we will be using the FVP (think of it as virtual hardware), we won't be requiring any real embedded hardware for this tutorial. ### Software @@ -64,19 +64,19 @@ uname -m Next we will walk through the steps performed by the `setup.sh` script to better understand the development setup. -### Download and Set Up the Corstone-300 FVP +### Download and Set Up the Corstone-300 and Corstone-320 FVP -Fixed Virtual Platforms (FVPs) are pre-configured, functionally accurate simulations of popular system configurations. Here in this tutorial, we are interested in the Corstone-300 system. We can download this from the Arm website. +Fixed Virtual Platforms (FVPs) are pre-configured, functionally accurate simulations of popular system configurations. Here in this tutorial, we are interested in Corstone-300 and Corstone-320 systems. We can download this from the Arm website. ```{note} By downloading and running the FVP software, you will be agreeing to the FVP [End-user license agreement (EULA)](https://developer.arm.com/downloads/-/arm-ecosystem-fvps/eula). ``` -To download, we can either download `Corstone-300 Ecosystem FVP` from [here](https://developer.arm.com/downloads/-/arm-ecosystem-fvps). or `setup.sh` script will does that for you under `setup_fvp` function. +To download, we can either download `Corstone-300 Ecosystem FVP` and `Corstone-320 Ecosystem FVP`from [here](https://developer.arm.com/downloads/-/arm-ecosystem-fvps). or `setup.sh` script does that for you under `setup_fvp` function. ### Download and Install the Arm GNU AArch32 Bare-Metal Toolchain -Similar to the FVP, we would also need a tool-chain to cross-compile ExecuTorch runtime, executor-runner bare-metal application, as well as the rest of the bare-metal stack for Cortex-M55 CPU available on the Corstone-300 platform. +Similar to the FVP, we would also need a tool-chain to cross-compile ExecuTorch runtime, executor-runner bare-metal application, as well as the rest of the bare-metal stack for Cortex-M55/M85 CPU available on the Corstone-300/Corstone-320 platform. These toolchains are available [here](https://developer.arm.com/downloads/-/arm-gnu-toolchain-downloads). We will be using GCC 12.3 targeting `arm-none-eabi` here for our tutorial. Just like FVP, `setup.sh` script will down the toolchain for you. See `setup_toolchain` function. @@ -103,10 +103,14 @@ At the end of the setup, if everything goes well, your top level devlopement dir │   ├── fetch_externals.py │ └── [...] ├── ethos-u-vela -├── FVP +├── FVP-corstone300 │ ├── FVP_Corstone_SSE-300.sh │ └── [...] +├── FVP-corstone320 +│ ├── FVP_Corstone_SSE-320.sh +│ └── [...] 
├── FVP_cs300.tgz +├── FVP_cs320.tgz ├── gcc.tar.xz └── reference_model ``` @@ -239,8 +243,7 @@ cmake -DCMAKE_BUILD_TYPE=Release \ -Bcmake-out-aot-lib \ "${et_root_dir}" -n=$(nproc) -cmake --build cmake-out-aot-lib -j"$((n - 5))" -- quantized_ops_aot_lib +cmake --build cmake-out-aot-lib --parallel -- quantized_ops_aot_lib ``` After the `quantized_ops_aot_lib` build, we can run the following script to generate the `.pte` file @@ -257,7 +260,7 @@ At the end of this, we should have three different `.pte` files. - The second one contains the [AddModule](#addmodule), with Arm Ethos-U backend delegate enabled. - The third one contains the [quantized MV2Model](#mv2module), with the Arm Ethos-U backend delegate enabled as well. -Now let's try to run these `.pte` files on a Corstone-300 platform in a bare-metal environment. +Now let's try to run these `.pte` files on a Corstone-300 and Corstone-320 platforms in a bare-metal environment. ## Getting a Bare-Metal Executable @@ -269,9 +272,13 @@ The block diagram below demonstrates, at the high level, how the various build a ![](./arm-delegate-runtime-build.svg) +```{tip} +The `generate_pte_file` function in `run.sh` script produces the `.pte` files based on the models provided through `--model_name` input argument +``` + ### Generating ExecuTorch Libraries -ExecuTorch's CMake build system produces a set of build pieces which are critical for us to include and run the ExecuTorch runtime with-in the bare-metal environment we have for Corstone-300 from Ethos-U SDK. +ExecuTorch's CMake build system produces a set of build pieces which are critical for us to include and run the ExecuTorch runtime with-in the bare-metal environment we have for Corstone FVPs from Ethos-U SDK. [This](./runtime-build-and-cross-compilation.md) document provides a detailed overview of each individual build piece. For running either variant of the `.pte` file, we will need a core set of libraries. Here is a list, @@ -283,133 +290,106 @@ To run a `.pte` file with the Arm backend delegate call instructions, we will ne - `libexecutorch_delegate_ethos_u.a` - -These libraries are generated in `build_executorch` function of the `run.sh` script. +These libraries are generated in `build_executorch` and `build_quantization_aot_lib` function of the `run.sh` script. In this function, `EXECUTORCH_SELECT_OPS_LIST` will decide the number of portable operators included in the build and are available at runtime. It must match with `.pte` file's requirements, otherwise you will get `Missing Operator` error at runtime. For example, there in the command line above, to run SoftmaxModule, we only included the softmax CPU operator. Similarly, to run AddModule in a non-delegated manner you will need add op and so on. As you might have already realized, for the delegated operators, which will be executed by the Arm backend delegate, we do not need to include those operators in this list. This is only for *non-delegated* operators. +```{tip} +The `run.sh` script takes in `--portable_kernels` option, which provides a way to supply a comma seperated list of portable kernels to be included. +``` + ### Building the executor_runner Bare-Metal Application The SDK dir is the same one prepared [earlier](#setup-the-arm-ethos-u-software-development). And, we will be passing the `.pte` file (any one of them) generated above. -Note, you have to generate a new `executor-runner` binary if you want to change the model or the `.pte` file. 
This constraint is from the constrained bare-metal runtime environment we have for Corstone-300 platform. +Note, you have to generate a new `executor-runner` binary if you want to change the model or the `.pte` file. This constraint is from the constrained bare-metal runtime environment we have for Corstone-300/Corstone-320 platforms. This is performed by the `build_executorch_runner` function in `run.sh`. -## Running on Corstone-300 FVP Platform +```{tip} +The `run.sh` script takes in `--target` option, which provides a way to provide a specific target, Corstone-300(ethos-u55-128) or Corstone-320(ethos-u85-128) +``` + +## Running on Corstone FVP Platforms -Once the elf is prepared, regardless of the `.pte` file variant is used to generate the bare metal elf, you can run in with following command, +Once the elf is prepared, regardless of the `.pte` file variant is used to generate the bare metal elf. The below command is used to run the [MV2Model](#mv2module) on Corstone-320 FVP ```bash ethos_u_build_dir=examples/arm/executor_runner/ elf=$(find ${ethos_u_build_dir} -name "arm_executor_runner") -FVP_Corstone_SSE-300_Ethos-U55 \ - -C ethosu.num_macs=128 \ - -C mps3_board.visualisation.disable-visualisation=1 \ - -C mps3_board.telnetterminal0.start_telnet=0 \ - -C mps3_board.uart0.out_file='-' \ +FVP_Corstone_SSE-320_Ethos-U85 \ + -C mps4_board.subsystem.cpu0.CFGITCMSZ=11 \ + -C mps4_board.subsystem.ethosu.num_macs=${num_macs} \ + -C mps4_board.visualisation.disable-visualisation=1 \ + -C vis_hdlcd.disable_visualisation=1 \ + -C mps4_board.telnetterminal0.start_telnet=0 \ + -C mps4_board.uart0.out_file='-' \ + -C mps4_board.uart0.shutdown_on_eot=1 \ -a "${elf}" \ - --timelimit 10 # seconds - after which sim will kill itself + --timelimit 120 || true # seconds- after which sim will kill itself ``` If successful, the simulator should produce something like the following on the shell, ```console - Ethos-U rev 136b7d75 --- Apr 12 2023 13:44:01 - (C) COPYRIGHT 2019-2023 Arm Limited - ALL RIGHTS RESERVED - -I executorch:runner.cpp:64] Model PTE file loaded. Size: 960 bytes. -I executorch:runner.cpp:70] Model buffer loaded, has 1 methods -I executorch:runner.cpp:78] Running method forward -I executorch:runner.cpp:95] Setting up planned buffer 0, size 32. -I executorch:runner.cpp:110] Method loaded. -I executorch:runner.cpp:112] Preparing inputs... -I executorch:runner.cpp:114] Input prepared. -I executorch:runner.cpp:116] Starting the model execution... -I executorch:runner.cpp:121] Model executed successfully. -I executorch:runner.cpp:125] 1 outputs: -Output[0][0]: 0.500000 -Output[0][1]: 0.500000 -Output[0][2]: 0.500000 -Output[0][3]: 0.500000 -Application exit code: 0. - -EXITTHESIM - -Info: Simulation is stopping. Reason: CPU time has been exceeded. -``` - -Here in this example, we ran the `executor_runner` binary with the `softmax.pte` file generated for the [SoftmaxModule](#softmaxmodule), we do see the expected results generated from the baremetal binary running on the Corstone-300 virtual hardware on FVP simulator. - -If you rerun the same FVP command with the delegated `.pte` file for the [AddModule](#addmodule), i.e. `add_arm_delegate.pte` - you may get something like following, again the expected results. Pay attention to the messages printed with prefix `ArmBackend::`, they indicate that the backend was sucecssfully initialized and the `add` operator from our AddModule in the `.pte` was exexuted on the Ethos-U55 NPU. 
- -```console - Ethos-U rev 136b7d75 --- Apr 12 2023 13:44:01 - (C) COPYRIGHT 2019-2023 Arm Limited - ALL RIGHTS RESERVED - -I executorch:runner.cpp:64] Model PTE file loaded. Size: 2208 bytes. -I executorch:runner.cpp:70] Model buffer loaded, has 1 methods -I executorch:runner.cpp:78] Running method forward -I executorch:runner.cpp:95] Setting up planned buffer 0, size 64. -I executorch:ArmBackendEthosU.cpp:51] ArmBackend::init 0x11000050 -I executorch:runner.cpp:110] Method loaded. -I executorch:runner.cpp:112] Preparing inputs... -I executorch:runner.cpp:114] Input prepared. -I executorch:runner.cpp:116] Starting the model execution... -I executorch:ArmBackendEthosU.cpp:103] ArmBackend::execute 0x11000050 -I executorch:runner.cpp:121] Model executed successfully. -I executorch:runner.cpp:125] 1 outputs: -Output[0][0]: 2 -Output[0][1]: 2 -Output[0][2]: 2 -Output[0][3]: 2 -Output[0][4]: 2 -Application exit code: 0. - -EXITTHESIM - -Info: Simulation is stopping. Reason: CPU time has been exceeded. -``` - -Similarily we can get the following output for running the [MV2Model](#mv2module) - -``` - Ethos-U rev 136b7d75 --- Apr 12 2023 13:44:01 - (C) COPYRIGHT 2019-2023 Arm Limited - ALL RIGHTS RESERVED - -I executorch:arm_executor_runner.cpp:60] Model in 0x70000000 $ -I executorch:arm_executor_runner.cpp:66] Model PTE file loaded. Size: 4556832 bytes. -I executorch:arm_executor_runner.cpp:77] Model buffer loaded, has 1 methods -I executorch:arm_executor_runner.cpp:85] Running method forward -I executorch:arm_executor_runner.cpp:109] Setting up planned buffer 0, size 752640. -I executorch:ArmBackendEthosU.cpp:49] ArmBackend::init 0x70000060 -I executorch:arm_executor_runner.cpp:130] Method loaded. -I executorch:arm_executor_runner.cpp:132] Preparing inputs... -I executorch:arm_executor_runner.cpp:141] Input prepared. -I executorch:arm_executor_runner.cpp:143] Starting the model execution... -I executorch:ArmBackendEthosU.cpp:87] ArmBackend::execute 0x70000060 -I executorch:ArmBackendEthosU.cpp:234] Tensor input 0 will be permuted +I [executorch:arm_executor_runner.cpp:364] Model in 0x70000000 $ +I [executorch:arm_executor_runner.cpp:366] Model PTE file loaded. Size: 4425968 bytes. +I [executorch:arm_executor_runner.cpp:376] Model buffer loaded, has 1 methods +I [executorch:arm_executor_runner.cpp:384] Running method forward +I [executorch:arm_executor_runner.cpp:395] Setup Method allocator pool. Size: 62914560 bytes. +I [executorch:arm_executor_runner.cpp:412] Setting up planned buffer 0, size 752640. +I [executorch:ArmBackendEthosU.cpp:79] ArmBackend::init 0x70000070 +I [executorch:arm_executor_runner.cpp:445] Method loaded. +I [executorch:arm_executor_runner.cpp:447] Preparing inputs... +I [executorch:arm_executor_runner.cpp:461] Input prepared. +I [executorch:arm_executor_runner.cpp:463] Starting the model execution... 
+I [executorch:ArmBackendEthosU.cpp:118] ArmBackend::execute 0x70000070 +I [executorch:ArmBackendEthosU.cpp:298] Tensor input/output 0 will be permuted +I [executorch:arm_perf_monitor.cpp:120] NPU Inferences : 1 +I [executorch:arm_perf_monitor.cpp:121] Profiler report, CPU cycles per operator: +I [executorch:arm_perf_monitor.cpp:125] ethos-u : cycle_cnt : 1498202 cycles +I [executorch:arm_perf_monitor.cpp:132] Operator(s) total: 1498202 CPU cycles +I [executorch:arm_perf_monitor.cpp:138] Inference runtime: 6925114 CPU cycles total +I [executorch:arm_perf_monitor.cpp:140] NOTE: CPU cycle values and ratio calculations require FPGA and identical CPU/NPU frequency +I [executorch:arm_perf_monitor.cpp:149] Inference CPU ratio: 99.99 % +I [executorch:arm_perf_monitor.cpp:153] Inference NPU ratio: 0.01 % +I [executorch:arm_perf_monitor.cpp:162] cpu_wait_for_npu_cntr : 729 CPU cycles +I [executorch:arm_perf_monitor.cpp:167] Ethos-U PMU report: +I [executorch:arm_perf_monitor.cpp:168] ethosu_pmu_cycle_cntr : 5920305 +I [executorch:arm_perf_monitor.cpp:171] ethosu_pmu_cntr0 : 359921 +I [executorch:arm_perf_monitor.cpp:171] ethosu_pmu_cntr1 : 0 +I [executorch:arm_perf_monitor.cpp:171] ethosu_pmu_cntr2 : 0 +I [executorch:arm_perf_monitor.cpp:171] ethosu_pmu_cntr3 : 503 +I [executorch:arm_perf_monitor.cpp:178] Ethos-U PMU Events:[ETHOSU_PMU_EXT0_RD_DATA_BEAT_RECEIVED, ETHOSU_PMU_EXT1_RD_DATA_BEAT_RECEIVED, ETHOSU_PMU_EXT0_WR_DATA_BEAT_WRITTEN, ETHOSU_PMU_NPU_IDLE] +I [executorch:arm_executor_runner.cpp:470] model_pte_loaded_size: 4425968 bytes. +I [executorch:arm_executor_runner.cpp:484] method_allocator_used: 1355722 / 62914560 free: 61558838 ( used: 2 % ) +I [executorch:arm_executor_runner.cpp:491] method_allocator_planned: 752640 bytes +I [executorch:arm_executor_runner.cpp:493] method_allocator_loaded: 966 bytes +I [executorch:arm_executor_runner.cpp:494] method_allocator_input: 602116 bytes +I [executorch:arm_executor_runner.cpp:495] method_allocator_executor: 0 bytes +I [executorch:arm_executor_runner.cpp:498] temp_allocator_used: 0 / 1048576 free: 1048576 ( used: 0 % ) I executorch:arm_executor_runner.cpp:152] Model executed successfully. I executorch:arm_executor_runner.cpp:156] 1 outputs: -Output[0][0]: -0.639322 -Output[0][1]: 0.169232 -Output[0][2]: -0.451286 +Output[0][0]: -0.749744 +Output[0][1]: -0.019224 +Output[0][2]: 0.134570 ...(Skipped) -Output[0][996]: 0.150429 -Output[0][997]: -0.488894 -Output[0][998]: 0.037607 -Output[0][999]: 1.203430 +Output[0][996]: -0.230691 +Output[0][997]: -0.634399 +Output[0][998]: -0.115345 +Output[0][999]: 1.576386 I executorch:arm_executor_runner.cpp:177] Program complete, exiting. I executorch:arm_executor_runner.cpp:179] ``` +```{note} +The `run.sh` script provides various options to select a particular FVP target, use desired models, select portable kernels and can be explored using the `--help` argument +``` + ## Takeaways Through this tutorial we've learnt how to use the ExecuTorch software to both export a standard model from PyTorch and to run it on the compact and fully functioned ExecuTorch runtime, enabling a smooth path for offloading models from PyTorch to Arm based platforms. 
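A quick way to read the runner log shown in the tutorial diff above: the allocator and CPU/NPU-ratio lines are plain arithmetic over the reported counters. The standalone check below reproduces them; it assumes the NPU ratio is simply wait-cycles divided by total inference cycles, which matches the printed 99.99 % / 0.01 % split, and it is not ExecuTorch code.

```cpp
// Standalone arithmetic check of the counters printed in the runner log.
#include <cstdint>
#include <cstdio>

int main() {
  // "Setup Method allocator pool. Size: 62914560" and "method_allocator_used: 1355722"
  const uint64_t pool_size = 62914560;
  const uint64_t allocator_used = 1355722;
  std::printf("free: %llu (used: %.0f %%)\n",
              static_cast<unsigned long long>(pool_size - allocator_used),
              100.0 * allocator_used / pool_size);  // free: 61558838 (used: 2 %)

  // "Inference runtime: 6925114 CPU cycles" and "cpu_wait_for_npu_cntr : 729"
  const uint64_t total_cycles = 6925114;
  const uint64_t npu_wait = 729;
  std::printf("NPU ratio: %.2f %%\n", 100.0 * npu_wait / total_cycles);  // 0.01 %
  return 0;
}
```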
diff --git a/examples/apple/coreml/scripts/export.py b/examples/apple/coreml/scripts/export.py index a83f1695f6..53316ea200 100644 --- a/examples/apple/coreml/scripts/export.py +++ b/examples/apple/coreml/scripts/export.py @@ -192,7 +192,8 @@ def main(): example_inputs, ) - save_executorch_program(exec_program, args.model_name, args.compute_unit) + model_name = f"{args.model_name}_compiled" if args.compile else args.model_name + save_executorch_program(exec_program, model_name, args.compute_unit) generate_etrecord(f"{args.model_name}_coreml_etrecord.bin", edge_copy, exec_program) if args.save_processed_bytes and lowered_module is not None: diff --git a/examples/qualcomm/oss_scripts/llama2/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama2/runner/runner.cpp index 4d8dd0b91f..358ad37b72 100644 --- a/examples/qualcomm/oss_scripts/llama2/runner/runner.cpp +++ b/examples/qualcomm/oss_scripts/llama2/runner/runner.cpp @@ -654,7 +654,7 @@ Error Runner::mem_alloc(size_t alignment, size_t seq_len) { // Reset and re-init again to trigger registered function module_.reset(); module_ = std::make_unique( - model_path_, Module::LoadMode::MmapUseMlockIgnoreErrors), + model_path_, Module::LoadMode::MmapUseMlockIgnoreErrors); ET_CHECK_MSG(load() == Error::Ok, "Runner failed to load method"); return Error::Ok; diff --git a/exir/pass_base.py b/exir/pass_base.py index 3b1a2928e2..db6bef8e3f 100644 --- a/exir/pass_base.py +++ b/exir/pass_base.py @@ -453,7 +453,7 @@ def on_attr(self, attr: ProxyValue) -> None: def placeholder(self, name: str, arg: Argument, meta: NodeMetadata) -> ProxyValue: arg_proxy = self.tracer.create_proxy("placeholder", name, (), {}) arg_proxy.node.meta = meta.data - self.tracer.set_metadata(arg_proxy.node, arg) + arg_proxy.node.meta["val"] = arg return ProxyValue(arg, arg_proxy) def call_operator( diff --git a/runtime/executor/method.cpp b/runtime/executor/method.cpp index a05d789a80..8e74a508f3 100644 --- a/runtime/executor/method.cpp +++ b/runtime/executor/method.cpp @@ -144,10 +144,12 @@ class BackendDelegate final { CompileSpec** out_spec) { auto number_of_compile_specs = compile_specs_in_program->size(); - CompileSpec* compile_specs_list = ET_ALLOCATE_LIST_OR_RETURN_ERROR( - backend_init_context.get_runtime_allocator(), - CompileSpec, - number_of_compile_specs); + CompileSpec* compile_specs_list = + backend_init_context.get_runtime_allocator()->allocateList( + number_of_compile_specs); + if (compile_specs_list == nullptr) { + return Error::MemoryAllocationFailed; + } // Initialize the spec list for each method spec for (size_t j = 0; j < number_of_compile_specs; j++) { @@ -226,8 +228,10 @@ Result gen_instruction_arguments( EValue* values, size_t num_args, const int32_t* arg_idxs) { - EValue** arg_list = - ET_ALLOCATE_LIST_OR_RETURN_ERROR(method_allocator, EValue*, num_args); + EValue** arg_list = method_allocator->allocateList(num_args); + if (arg_list == nullptr) { + return Error::MemoryAllocationFailed; + } for (size_t i = 0; i < num_args; ++i) { int32_t arg_idx = arg_idxs[i]; ET_CHECK_OR_RETURN_ERROR( @@ -287,8 +291,10 @@ Error Method::parse_values() { ET_CHECK_OR_RETURN_ERROR( flatbuffer_values != nullptr, InvalidProgram, "Missing values"); size_t n_value = flatbuffer_values->size(); - values_ = ET_ALLOCATE_LIST_OR_RETURN_ERROR( - memory_manager_->method_allocator(), EValue, n_value); + values_ = memory_manager_->method_allocator()->allocateList(n_value); + if (values_ == nullptr) { + return Error::MemoryAllocationFailed; + } // n_value_ counts the number of 
  // successfully-initialized values for ~Method()
  // to clean up, and is incremented at the bottom of the loop. This makes it
@@ -510,8 +516,11 @@ Error Method::resolve_operator(
   // resolve tensor meta
   auto method_allocator = memory_manager_->method_allocator();
-  TensorMeta* meta =
-      ET_ALLOCATE_LIST_OR_RETURN_ERROR(method_allocator, TensorMeta, n_args);
+  TensorMeta* meta = method_allocator->allocateList<TensorMeta>(n_args);
+  if (meta == nullptr) {
+    return Error::MemoryAllocationFailed;
+  }
+
   size_t count = 0;
   for (size_t i = 0; i < n_args; i++) {
     EValue* eval = args[i];
@@ -519,8 +528,11 @@ Error Method::resolve_operator(
     if (eval->isTensor()) {
       auto tensor = eval->toTensor();
       meta[count].dtype_ = tensor.scalar_type();
-      exec_aten::DimOrderType* dim_order_ptr = ET_ALLOCATE_LIST_OR_RETURN_ERROR(
-          method_allocator, exec_aten::DimOrderType, tensor.dim());
+      exec_aten::DimOrderType* dim_order_ptr =
+          method_allocator->allocateList<exec_aten::DimOrderType>(tensor.dim());
+      if (dim_order_ptr == nullptr) {
+        return Error::MemoryAllocationFailed;
+      }
       size_t size = tensor.dim();
       err = get_dim_order(tensor, dim_order_ptr, size);
       ET_CHECK_OR_RETURN_ERROR(
@@ -554,8 +566,11 @@ Result<Method> Method::load(
   MemoryAllocator* temp_allocator = memory_manager->temp_allocator();
   if (temp_allocator == nullptr) {
     PlatformMemoryAllocator* platform_allocator =
-        ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR(
-            memory_manager->method_allocator(), PlatformMemoryAllocator);
+        memory_manager->method_allocator()
+            ->allocateInstance<PlatformMemoryAllocator>();
+    if (platform_allocator == nullptr) {
+      return Error::MemoryAllocationFailed;
+    }
     new (platform_allocator) PlatformMemoryAllocator();
     temp_allocator = platform_allocator;
   }
@@ -599,8 +614,10 @@ Error Method::init(executorch_flatbuffer::ExecutionPlan* s_plan) {
     ET_CHECK_OR_RETURN_ERROR(
         delegates != nullptr, InvalidProgram, "Missing delegates field");
     size_t n_delegate = delegates->size();
-    delegates_ = ET_ALLOCATE_LIST_OR_RETURN_ERROR(
-        method_allocator, BackendDelegate, n_delegate);
+    delegates_ = method_allocator->allocateList<BackendDelegate>(n_delegate);
+    if (delegates_ == nullptr) {
+      return Error::MemoryAllocationFailed;
+    }
 
     // n_delegate_ counts the number of successfully-initialized delegates for
     // ~Method() to clean up, and is incremented at the bottom of the loop. This
@@ -628,8 +645,10 @@ Error Method::init(executorch_flatbuffer::ExecutionPlan* s_plan) {
     ET_CHECK_OR_RETURN_ERROR(
         chains != nullptr && chains->size() > 0, InvalidProgram, "No chains");
     n_chains_ = chains->size();
-    chains_ =
-        ET_ALLOCATE_LIST_OR_RETURN_ERROR(method_allocator, Chain, n_chains_);
+    chains_ = method_allocator->allocateList<Chain>(n_chains_);
+    if (chains_ == nullptr) {
+      return Error::MemoryAllocationFailed;
+    }
 
     // Try resolving all operators before failing, to make it easier to debug
     // multiple problems at once.
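The hunks above and below all apply the same refactor: the `ET_ALLOCATE_LIST_OR_RETURN_ERROR` macro is replaced by an explicit `allocateList<T>()` call followed by a `nullptr` check that returns `Error::MemoryAllocationFailed`. The following is a minimal, self-contained sketch of that pattern; `ToyAllocator`, `fill_squares`, and the capacity numbers are invented for illustration and are not part of the ExecuTorch API.

```cpp
#include <cstddef>
#include <cstdio>
#include <cstdlib>

// Hypothetical error codes and arena allocator, standing in for the real
// runtime types so the sketch compiles on its own.
enum class Error { Ok, MemoryAllocationFailed };

class ToyAllocator {
 public:
  explicit ToyAllocator(std::size_t capacity)
      : buffer_(static_cast<char*>(std::malloc(capacity))),
        capacity_(capacity),
        used_(0) {}
  ~ToyAllocator() { std::free(buffer_); }

  // Returns nullptr when the arena is exhausted, so the caller can surface an
  // explicit error instead of relying on a macro that hides the early return.
  template <typename T>
  T* allocateList(std::size_t n) {
    std::size_t bytes = n * sizeof(T);
    if (buffer_ == nullptr || used_ + bytes > capacity_) {
      return nullptr;
    }
    T* out = reinterpret_cast<T*>(buffer_ + used_);
    used_ += bytes;
    return out;
  }

 private:
  char* buffer_;
  std::size_t capacity_;
  std::size_t used_;
};

Error fill_squares(ToyAllocator& allocator, std::size_t n) {
  // The pattern used throughout the patch: allocate, check for nullptr,
  // return a typed error on failure.
  int* values = allocator.allocateList<int>(n);
  if (values == nullptr) {
    return Error::MemoryAllocationFailed;
  }
  for (std::size_t i = 0; i < n; ++i) {
    values[i] = static_cast<int>(i * i);
  }
  return Error::Ok;
}

int main() {
  ToyAllocator allocator(64);
  std::printf("small request ok: %d\n", fill_squares(allocator, 4) == Error::Ok);
  std::printf("large request ok: %d\n", fill_squares(allocator, 1000) == Error::Ok);
  return 0;
}
```

Making the check explicit keeps the early return visible at each call site, at the cost of a few extra lines per allocation, which is the trade-off these hunks accept.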
@@ -644,10 +663,16 @@ Error Method::init(executorch_flatbuffer::ExecutionPlan* s_plan) {
           "Missing instructions in chain %zu",
           i);
       auto num_instructions = s_instructions->size();
-      auto chain_instruction_kernels = ET_ALLOCATE_LIST_OR_RETURN_ERROR(
-          method_allocator, OpFunction, num_instructions);
-      auto chain_instruction_arg_lists = ET_ALLOCATE_LIST_OR_RETURN_ERROR(
-          method_allocator, InstructionArgs, num_instructions);
+      auto chain_instruction_kernels =
+          method_allocator->allocateList<OpFunction>(num_instructions);
+      if (chain_instruction_kernels == nullptr) {
+        return Error::MemoryAllocationFailed;
+      }
+      auto chain_instruction_arg_lists =
+          method_allocator->allocateList<InstructionArgs>(num_instructions);
+      if (chain_instruction_arg_lists == nullptr) {
+        return Error::MemoryAllocationFailed;
+      }
 
       // Set up the argument lists ahead of time and store pointers to them to
       // use when the instructions are called
diff --git a/runtime/executor/tensor_parser.h b/runtime/executor/tensor_parser.h
index 1d860bfc30..d0a818a721 100644
--- a/runtime/executor/tensor_parser.h
+++ b/runtime/executor/tensor_parser.h
@@ -37,13 +37,18 @@ parseListOptionalType(
     const flatbuffers::Vector<int32_t>* value_indices,
     EValue* values_,
     MemoryManager* memory_manager) {
-  auto* evalp_list = ET_ALLOCATE_LIST_OR_RETURN_ERROR(
-      memory_manager->method_allocator(), EValue*, value_indices->size());
-
-  auto* optional_tensor_list = ET_ALLOCATE_LIST_OR_RETURN_ERROR(
-      memory_manager->method_allocator(),
-      executorch::aten::optional<T>,
+  auto* evalp_list = memory_manager->method_allocator()->allocateList<EValue*>(
       value_indices->size());
+  if (evalp_list == nullptr) {
+    return Error::MemoryAllocationFailed;
+  }
+
+  auto* optional_tensor_list =
+      memory_manager->method_allocator()
+          ->allocateList<executorch::aten::optional<T>>(value_indices->size());
+  if (optional_tensor_list == nullptr) {
+    return Error::MemoryAllocationFailed;
+  }
 
   size_t output_idx = 0;
   // For each index look up the corresponding EValue (which has been
diff --git a/runtime/executor/tensor_parser_exec_aten.cpp b/runtime/executor/tensor_parser_exec_aten.cpp
index f30d835ccc..23101e5a1c 100644
--- a/runtime/executor/tensor_parser_exec_aten.cpp
+++ b/runtime/executor/tensor_parser_exec_aten.cpp
@@ -74,12 +74,17 @@ ET_NODISCARD Result<BoxedEvalueList<exec_aten::Tensor>> parseTensorList(
     MemoryManager* memory_manager) {
   EXECUTORCH_SCOPE_PROF("TensorParser::parseTensorList");
 
-  auto* tensor_list = ET_ALLOCATE_LIST_OR_RETURN_ERROR(
-      memory_manager->method_allocator(),
-      exec_aten::Tensor,
+  auto* tensor_list =
+      memory_manager->method_allocator()->allocateList<exec_aten::Tensor>(
+          tensor_indices->size());
+  if (tensor_list == nullptr) {
+    return Error::MemoryAllocationFailed;
+  }
+  auto* evalp_list = memory_manager->method_allocator()->allocateList<EValue*>(
       tensor_indices->size());
-  auto* evalp_list = ET_ALLOCATE_LIST_OR_RETURN_ERROR(
-      memory_manager->method_allocator(), EValue*, tensor_indices->size());
+  if (evalp_list == nullptr) {
+    return Error::MemoryAllocationFailed;
+  }
 
   // For each tensor index look up the corresponding Tensor (which has been
   // already allocated) and stick it in the list.
diff --git a/runtime/platform/assert.h b/runtime/platform/assert.h
index 3a574334db..f3cd8b7290 100644
--- a/runtime/platform/assert.h
+++ b/runtime/platform/assert.h
@@ -34,12 +34,12 @@
  * @param[in] ... Format string arguments.
  */
 #define ET_CHECK_MSG(_cond, _format, ...)                               \
-  ({                                                                    \
+  do {                                                                  \
     if ET_UNLIKELY (!(_cond)) {                                         \
       ET_ASSERT_MESSAGE_EMIT(" (%s): " _format, #_cond, ##__VA_ARGS__); \
       ::executorch::runtime::runtime_abort();                           \
     }                                                                   \
-  })
+  } while (0)
 
 /**
  * Abort the runtime if the condition is not true.
@@ -48,12 +48,12 @@
  * @param[in] _cond Condition asserted as true.
  */
 #define ET_CHECK(_cond)                           \
-  ({                                              \
+  do {                                            \
     if ET_UNLIKELY (!(_cond)) {                   \
       ET_ASSERT_MESSAGE_EMIT(": %s", #_cond);     \
       ::executorch::runtime::runtime_abort();     \
     }                                             \
-  })
+  } while (0)
 
 #ifdef NDEBUG
 
@@ -102,10 +102,10 @@
  * Assert that this code location is unreachable during execution.
  */
 #define ET_ASSERT_UNREACHABLE()                                     \
-  ({                                                                \
+  do {                                                              \
     ET_CHECK_MSG(false, "Execution should not reach this point");   \
     ET_UNREACHABLE();                                               \
-  })
+  } while (0)
 
 /**
  * Assert that this code location is unreachable during execution.
diff --git a/runtime/platform/default/posix.cpp b/runtime/platform/default/posix.cpp
index aba504f53e..8807a62516 100644
--- a/runtime/platform/default/posix.cpp
+++ b/runtime/platform/default/posix.cpp
@@ -50,7 +50,7 @@
  * Assert that the PAL has been initialized.
  */
 #define _ASSERT_PAL_INITIALIZED()     \
-  ({                                  \
+  do {                                \
     if (!initialized) {               \
       fprintf(                        \
           ET_LOG_OUTPUT_FILE,         \
@@ -59,7 +59,7 @@
       fflush(ET_LOG_OUTPUT_FILE);     \
       et_pal_abort();                 \
     }                                 \
-  })
+  } while (0)
 
 #endif // NDEBUG
 
diff --git a/runtime/platform/log.h b/runtime/platform/log.h
index 2b1e791b06..9ad234b252 100644
--- a/runtime/platform/log.h
+++ b/runtime/platform/log.h
@@ -155,7 +155,7 @@ using ::executorch::runtime::LogLevel;
  * @param[in] _format Log message format string.
  */
 #define ET_LOG(_level, _format, ...)                                  \
-  ({                                                                  \
+  do {                                                                \
     const auto _log_level = ::executorch::runtime::LogLevel::_level;  \
     if (static_cast<uint32_t>(_log_level) >=                          \
         static_cast<uint32_t>(                                        \
@@ -171,8 +171,7 @@ using ::executorch::runtime::LogLevel;
           _format,                                                    \
           ##__VA_ARGS__);                                             \
     }                                                                 \
-  })
-
+  } while (0)
 #else // ET_LOG_ENABLED
 
 /**
diff --git a/runtime/platform/profiler.h b/runtime/platform/profiler.h
index 07ffd9c349..d636278139 100644
--- a/runtime/platform/profiler.h
+++ b/runtime/platform/profiler.h
@@ -248,20 +248,29 @@ using ::executorch::runtime::track_allocator;
 
 #else
 
-#define EXECUTORCH_PROFILE_CREATE_BLOCK(name) ({ (void)(name); })
+#define EXECUTORCH_PROFILE_CREATE_BLOCK(name) \
+  do {                                        \
+    (void)(name);                             \
+  } while (0)
 
 #define EXECUTORCH_BEGIN_PROF(name) \
   {}
 
-#define EXECUTORCH_END_PROF(token_id) ({ (void)(token_id); })
+#define EXECUTORCH_END_PROF(token_id) \
+  do {                                \
+    (void)(token_id);                 \
+  } while (0)
 
-#define EXECUTORCH_SCOPE_PROF(name) ({ (void)(name); })
+#define EXECUTORCH_SCOPE_PROF(name) \
+  do {                              \
+    (void)(name);                   \
+  } while (0)
 
 #define EXECUTORCH_PROFILE_INSTRUCTION_SCOPE(chain_idx, instruction_idx) \
-  ({                                                                     \
+  do {                                                                   \
     (void)(chain_idx);                                                   \
     (void)(instruction_idx);                                             \
-  })
+  } while (0)
 
 #define EXECUTORCH_DUMP_PROFILE_RESULTS(prof_result_test) \
   memset(prof_result_test, 0, sizeof(::executorch::runtime::prof_result_t));
@@ -269,16 +278,12 @@ using ::executorch::runtime::track_allocator;
 
 #define EXECUTORCH_RESET_PROFILE_RESULTS() \
   {}
 
-#define EXECUTORCH_TRACK_ALLOCATOR(name) \
-  ({                                     \
-    (void)(name);                        \
-    -1;                                  \
-  })
+#define EXECUTORCH_TRACK_ALLOCATOR(name) ((void)(name), -1)
 
 #define EXECUTORCH_TRACK_ALLOCATION(id, size) \
-  ({                                          \
+  do {                                        \
     (void)(id);                               \
     (void)(size);                             \
-  })
+  } while (0)
 
 #endif
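A recurring change in the `runtime/platform` hunks above is converting macro bodies from GNU statement expressions, `({ ... })`, to the standard `do { ... } while (0)` idiom. The sketch below shows why that idiom composes safely with `if`/`else`; the `MY_CHECK` name is invented for this example and only stands in for macros such as `ET_CHECK`.

```cpp
#include <cstdio>

// Wrapping the body in do { ... } while (0) makes the expansion a single
// statement that requires a trailing semicolon, so it nests cleanly under
// if/else, unlike a bare { ... } block or a GNU statement expression ({ ... }).
#define MY_CHECK(cond)                                   \
  do {                                                   \
    if (!(cond)) {                                       \
      std::fprintf(stderr, "check failed: %s\n", #cond); \
    }                                                    \
  } while (0)

int main(int argc, char** argv) {
  // Both branches below parse as intended even without braces; with a bare
  // { ... } expansion, the semicolon after the first MY_CHECK(...) would
  // terminate the if statement and orphan the else.
  if (argc > 1)
    MY_CHECK(argv[1] != nullptr);
  else
    MY_CHECK(argc == 1);
  return 0;
}
```

Statement expressions are a GNU extension, while `do { ... } while (0)` is standard C and C++. The one macro that must yield a value, `EXECUTORCH_TRACK_ALLOCATOR`, cannot use this idiom, which is why the patch rewrites it as the comma expression `((void)(name), -1)` instead.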