diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt index ec597b9c72..335a8930dc 100644 --- a/.ci/docker/ci_commit_pins/pytorch.txt +++ b/.ci/docker/ci_commit_pins/pytorch.txt @@ -1 +1 @@ -f5b99976adcbb01fd71bd0a39ea15bdac6c9e48a +6ca9ae4f8693639c395544327f7e362441a58c79 diff --git a/.ci/docker/common/install_conda.sh b/.ci/docker/common/install_conda.sh index 6900013564..7def3cb318 100755 --- a/.ci/docker/common/install_conda.sh +++ b/.ci/docker/common/install_conda.sh @@ -44,13 +44,15 @@ install_pip_dependencies() { } fix_conda_ubuntu_libstdcxx() { + cat /etc/issue # WARNING: This is a HACK from PyTorch core to be able to build PyTorch on 22.04. - # The issue still exists with the latest conda 23.10.0-1 at the time of writing - # (2023/11/16). + # Specifically, ubuntu-20+ all comes lib libstdc++ newer than 3.30+, but anaconda + # is stuck with 3.29. So, remove libstdc++6.so.3.29 as installed by + # https://anaconda.org/anaconda/libstdcxx-ng/files?version=11.2.0 # # PyTorch sev: https://github.com/pytorch/pytorch/issues/105248 # Ref: https://github.com/pytorch/pytorch/blob/main/.ci/docker/common/install_conda.sh - if grep -e "[12][82].04.[623]" /etc/issue >/dev/null; then + if grep -e "2[02].04." /etc/issue >/dev/null; then rm "/opt/conda/envs/py_${PYTHON_VERSION}/lib/libstdc++.so.6" fi } diff --git a/.ci/scripts/utils.sh b/.ci/scripts/utils.sh index 04d3307220..5ba8c57cdc 100644 --- a/.ci/scripts/utils.sh +++ b/.ci/scripts/utils.sh @@ -19,7 +19,7 @@ install_executorch() { which pip # Install executorch, this assumes that Executorch is checked out in the # current directory - pip install . --no-build-isolation + pip install . --no-build-isolation -v # Just print out the list of packages for debugging pip list } diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml index 77725298a2..d93e9e9cef 100644 --- a/.github/pytorch-probot.yml +++ b/.github/pytorch-probot.yml @@ -2,3 +2,5 @@ ciflow_push_tags: - ciflow/nightly - ciflow/trunk +- ciflow/binaries +- ciflow/binaries/all diff --git a/.github/workflows/_unittest.yml b/.github/workflows/_unittest.yml index b03136e380..a149fde3aa 100644 --- a/.github/workflows/_unittest.yml +++ b/.github/workflows/_unittest.yml @@ -33,8 +33,11 @@ jobs: conda activate "${CONDA_ENV}" BUILD_TOOL=${{ matrix.build-tool }} + # Setup MacOS dependencies as there is no Docker support on MacOS atm - PYTHON_EXECUTABLE=python EXECUTORCH_BUILD_PYBIND=ON bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" + PYTHON_EXECUTABLE=python \ + EXECUTORCH_BUILD_PYBIND=ON \ + .ci/scripts/setup-linux.sh "${BUILD_TOOL}" # Run pytest with coverage pytest -n auto --cov=./ --cov-report=xml @@ -59,8 +62,13 @@ jobs: BUILD_TOOL=${{ matrix.build-tool }} bash .ci/scripts/setup-conda.sh + # Setup MacOS dependencies as there is no Docker support on MacOS atm - PYTHON_EXECUTABLE=python ${CONDA_RUN} EXECUTORCH_BUILD_PYBIND=ON bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + PYTHON_EXECUTABLE=python \ + EXECUTORCH_BUILD_PYBIND=ON \ + CMAKE_ARGS="-DEXECUTORCH_BUILD_COREML=ON -DEXECUTORCH_BUILD_MPS=ON -DEXECUTORCH_BUILD_XNNPACK=ON" \ + ${CONDA_RUN} --no-capture-output \ + .ci/scripts/setup-macos.sh "${BUILD_TOOL}" # Run pytest with coverage ${CONDA_RUN} pytest -n auto --cov=./ --cov-report=xml diff --git a/.github/workflows/app-build.yml b/.github/workflows/app-build.yml deleted file mode 100644 index 6a5b02b4fc..0000000000 --- a/.github/workflows/app-build.yml +++ /dev/null @@ -1,44 +0,0 @@ -name: Build ExecuTorch demo apps - -on: - push: - 
branches: - - main - - release/* - pull_request: - paths: - - .ci/docker/** - - .github/workflows/app-build.yml - - install_requirements.sh - - backends/apple/** - - build/build_apple_frameworks.sh - - build/test_ios_ci.sh - - examples/demo-apps/** - - extension/module/** - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} - cancel-in-progress: true - -jobs: - test-demo-ios: - name: test-demo-ios - uses: pytorch/test-infra/.github/workflows/macos_job.yml@main - with: - runner: macos-latest-xlarge - python-version: '3.11' - submodules: 'true' - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - timeout: 90 - script: | - WORKSPACE=$(pwd) - pushd "${WORKSPACE}/pytorch/executorch" - BUILD_TOOL=cmake - - bash .ci/scripts/setup-conda.sh - # Setup MacOS dependencies as there is no Docker support on MacOS atm - GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" - # Build and test iOS Demo App - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash build/test_ios_ci.sh - popd diff --git a/.github/workflows/apple.yml b/.github/workflows/apple.yml new file mode 100644 index 0000000000..0a4a06aa70 --- /dev/null +++ b/.github/workflows/apple.yml @@ -0,0 +1,92 @@ +name: Apple + +on: + push: + branches: + - main + - release/* + pull_request: + paths: + - .ci/docker/** + - .github/workflows/app-build.yml + - install_requirements.sh + - backends/apple/** + - build/build_apple_frameworks.sh + - build/create_frameworks.sh + - build/test_ios_ci.sh + - examples/demo-apps/** + - extension/apple/** + - extension/module/** + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +jobs: + test-demo-ios: + name: test-demo-ios + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + with: + runner: macos-latest-xlarge + python-version: '3.11' + submodules: 'true' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 90 + script: | + WORKSPACE=$(pwd) + pushd "${WORKSPACE}/pytorch/executorch" + BUILD_TOOL=cmake + + .ci/scripts/setup-conda.sh + + # Setup MacOS dependencies as there is no Docker support on MacOS atm + GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + + # Build and test iOS Demo App + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + build/test_ios_ci.sh + + popd + + build-frameworks-ios: + name: build-frameworks-ios + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + with: + runner: macos-latest-xlarge + python-version: '3.11' + submodules: 'true' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + upload-artifact: executorch.zip + timeout: 90 + script: | + WORKSPACE=$(pwd) + pushd "${WORKSPACE}/pytorch/executorch" + BUILD_TOOL=cmake + VERSION="0.1.0" + OUTPUT="executorch-${VERSION}" + + .ci/scripts/setup-conda.sh + + # Setup MacOS dependencies as there is no Docker support on MacOS atm + GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + + # Install CoreML Backend Requirements + 
PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + backends/apple/coreml/scripts/install_requirements.sh + + # Install MPS Backend Requirements + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + backends/apple/mps/install_requirements.sh + + # Build iOS Frameworks + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + build/build_apple_frameworks.sh --output="${OUTPUT}" --coreml --mps --portable --xnnpack + + # Bundle iOS Frameworks + cp LICENSE "${OUTPUT}" + zip -r "${RUNNER_TEMP}/artifacts/${OUTPUT}.zip" "${OUTPUT}" + + popd diff --git a/.github/workflows/build-wheels-linux.yml b/.github/workflows/build-wheels-linux.yml new file mode 100644 index 0000000000..a2f86b219f --- /dev/null +++ b/.github/workflows/build-wheels-linux.yml @@ -0,0 +1,57 @@ +# From https://github.com/pytorch/test-infra/wiki/Using-Nova-Reusable-Build-Workflows +name: Build Linux Wheels + +on: + pull_request: + paths: + - build/packaging/** + - .github/workflows/build-wheels-linux.yml + push: + branches: + - nightly + - release/* + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - ciflow/binaries/* + workflow_dispatch: + +jobs: + generate-matrix: + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main + with: + package-type: wheel + os: linux + test-infra-repository: pytorch/test-infra + test-infra-ref: main + with-cuda: disabled + with-rocm: disabled + + build: + needs: generate-matrix + permissions: + id-token: write + contents: read + strategy: + fail-fast: false + matrix: + include: + - repository: pytorch/executorch + pre-script: build/packaging/pre_build_script.sh + post-script: build/packaging/post_build_script.sh + smoke-test-script: build/packaging/smoke_test.py + package-name: executorch + name: ${{ matrix.repository }} + uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main + with: + repository: ${{ matrix.repository }} + ref: "" + test-infra-repository: pytorch/test-infra + test-infra-ref: main + build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + pre-script: ${{ matrix.pre-script }} + post-script: ${{ matrix.post-script }} + package-name: ${{ matrix.package-name }} + smoke-test-script: ${{ matrix.smoke-test-script }} + trigger-event: ${{ github.event_name }} diff --git a/.github/workflows/build-wheels-m1.yml b/.github/workflows/build-wheels-m1.yml new file mode 100644 index 0000000000..dbc74433ff --- /dev/null +++ b/.github/workflows/build-wheels-m1.yml @@ -0,0 +1,58 @@ +# From https://github.com/pytorch/test-infra/wiki/Using-Nova-Reusable-Build-Workflows +name: Build M1 Wheels + +on: + pull_request: + paths: + - build/packaging/** + - .github/workflows/build-wheels-m1.yml + push: + branches: + - nightly + - release/* + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - ciflow/binaries/* + workflow_dispatch: + +jobs: + generate-matrix: + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main + with: + package-type: wheel + os: macos-arm64 + test-infra-repository: pytorch/test-infra + test-infra-ref: main + with-cuda: disabled + with-rocm: disabled + + build: + needs: generate-matrix + permissions: + id-token: write + contents: read + strategy: + fail-fast: false + matrix: + include: + - repository: pytorch/executorch + pre-script: 
build/packaging/pre_build_script.sh + post-script: build/packaging/post_build_script.sh + smoke-test-script: build/packaging/smoke_test.py + package-name: executorch + name: ${{ matrix.repository }} + uses: pytorch/test-infra/.github/workflows/build_wheels_macos.yml@main + with: + repository: ${{ matrix.repository }} + ref: "" + test-infra-repository: pytorch/test-infra + test-infra-ref: main + build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + pre-script: ${{ matrix.pre-script }} + post-script: ${{ matrix.post-script }} + package-name: ${{ matrix.package-name }} + runner-type: macos-m1-stable + smoke-test-script: ${{ matrix.smoke-test-script }} + trigger-event: ${{ github.event_name }} diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 7e4dba0b84..7f70f0cfef 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -206,7 +206,9 @@ jobs: # build module for executorch.extension.pybindings.portable_lib BUILD_TOOL=${{ matrix.build-tool }} - PYTHON_EXECUTABLE=python EXECUTORCH_BUILD_PYBIND=ON bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" + PYTHON_EXECUTABLE=python \ + EXECUTORCH_BUILD_PYBIND=ON \ + bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" # see if we can import the module successfully python -c "from executorch.extension.pybindings import portable_lib; print('success!')" diff --git a/.gitmodules b/.gitmodules index ce839f0b10..e21abf3bae 100644 --- a/.gitmodules +++ b/.gitmodules @@ -55,3 +55,7 @@ [submodule "backends/vulkan/third-party/Vulkan-Headers"] path = backends/vulkan/third-party/Vulkan-Headers url = https://github.com/KhronosGroup/Vulkan-Headers +[submodule "third-party/lm-evaluation-harness"] + path = third-party/lm-evaluation-harness + url = https://github.com/EleutherAI/lm-evaluation-harness + branch = v0.4.1 diff --git a/CMakeLists.txt b/CMakeLists.txt index 185214c275..778b7886cc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -98,7 +98,7 @@ endif() # data into sections so they can be properly gc'd. -s: strip symbol. # -fno-exceptions -fno-rtti: disables exceptions and runtime type. set(CMAKE_CXX_FLAGS_RELEASE - "-O2 -ffunction-sections -fdata-sections -fno-exceptions -fno-rtti") + "-ffunction-sections -fdata-sections -fno-exceptions -fno-rtti") if(NOT APPLE) set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -s") endif() @@ -206,6 +206,13 @@ else() set(CMAKE_TOOLCHAIN_IOS OFF) endif() +# Detect if an Android toolchain is set. +if(CMAKE_TOOLCHAIN_FILE MATCHES ".*android\.toolchain\.cmake$") + set(CMAKE_TOOLCHAIN_ANDROID ON) +else() + set(CMAKE_TOOLCHAIN_ANDROID OFF) +endif() + # EXECUTORCH_BUILD_HOST_TARGETS: Option to control the building of host-only # tools like `flatc`, along with example executables like `executor_runner` and # libraries that it uses, like `gflags`. 
Disabling this can be helpful when @@ -328,11 +335,6 @@ if(EXECUTORCH_BUILD_GTESTS) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/googletest) endif() -option(EXECUTORCH_BUILD_ANDROID_JNI "Build Android JNI" OFF) -if(EXECUTORCH_BUILD_ANDROID_JNI) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/android) -endif() - if(EXECUTORCH_BUILD_SDK) set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER ON @@ -340,6 +342,10 @@ if(EXECUTORCH_BUILD_SDK) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/sdk) endif() +if(EXECUTORCH_BUILD_EXTENSION_APPLE) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/apple) +endif() + if(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/data_loader) endif() @@ -361,6 +367,12 @@ if(EXECUTORCH_BUILD_VULKAN) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/vulkan) endif() +option(EXECUTORCH_BUILD_ANDROID_JNI "Build Android JNI" OFF) +if(EXECUTORCH_BUILD_ANDROID_JNI) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/examples/models/llama2/runner) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/android) +endif() + option(EXECUTORCH_BUILD_QNN "Build the backends/qualcomm directory" OFF) if(EXECUTORCH_BUILD_QNN) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/qualcomm) diff --git a/backends/apple/coreml/CMakeLists.txt b/backends/apple/coreml/CMakeLists.txt index b9cc309ff3..167a16f1ba 100644 --- a/backends/apple/coreml/CMakeLists.txt +++ b/backends/apple/coreml/CMakeLists.txt @@ -4,6 +4,10 @@ cmake_minimum_required(VERSION 3.19) project(executorch_coreml_backend) +if(NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17) +endif() + # Source root directory for executorch. if(NOT EXECUTORCH_ROOT) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) @@ -117,8 +121,3 @@ set( TARGET coremldelegate APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-receiver-expr" ) - -set_property( - TARGET coremldelegate - PROPERTY CXX_STANDARD 17 -) diff --git a/backends/apple/coreml/quantizer/coreml_quantizer.py b/backends/apple/coreml/quantizer/coreml_quantizer.py new file mode 100644 index 0000000000..ad596f7dfc --- /dev/null +++ b/backends/apple/coreml/quantizer/coreml_quantizer.py @@ -0,0 +1,7 @@ +# Copyright © 2024 Apple Inc. All rights reserved. +# +# Please refer to the license found in the LICENSE file in the root directory of the source tree. + +from coremltools.optimize.torch.quantization._coreml_quantizer import ( # noqa: FLAKE8 F401 + CoreMLQuantizer, +) diff --git a/backends/apple/coreml/runtime/delegate/asset.h b/backends/apple/coreml/runtime/delegate/asset.h index c838cf8d6d..e6cd371cd7 100644 --- a/backends/apple/coreml/runtime/delegate/asset.h +++ b/backends/apple/coreml/runtime/delegate/asset.h @@ -8,6 +8,7 @@ #import #import +#import #import #import diff --git a/backends/apple/coreml/scripts/generate_test_models.sh b/backends/apple/coreml/scripts/generate_test_models.sh index be71b56e31..7beca63726 100755 --- a/backends/apple/coreml/scripts/generate_test_models.sh +++ b/backends/apple/coreml/scripts/generate_test_models.sh @@ -23,8 +23,8 @@ cd "$EXECUTORCH_ROOT_PATH" MODELS=("add" "mul" "mv3") for MODEL in "${MODELS[@]}" do - # TODO: Don't use the script in examples directory. - python3 -m examples.apple.coreml.scripts.export_and_delegate --model_name "$MODEL" --save_processed_bytes + # TODO: Don't use the script in examples directory. 
+ python3 -m examples.apple.coreml.scripts.export --model_name "$MODEL" --save_processed_bytes mv -f "$MODEL""_coreml_all.pte" "$COREML_DIR_PATH/runtime/test/models" mv -f "$MODEL""_coreml_all.bin" "$COREML_DIR_PATH/runtime/test/models" done diff --git a/backends/apple/coreml/setup.md b/backends/apple/coreml/setup.md index 86afc50f00..c01f6e2d23 100644 --- a/backends/apple/coreml/setup.md +++ b/backends/apple/coreml/setup.md @@ -11,18 +11,18 @@ This is a tutorial for setting up the Core ML backend. ``` cd executorch -./backends/apple/coreml/scripts/install_requirements.sh +./backends/apple/coreml/scripts/install_requirements.sh -``` +``` -3. Run the example script to validate that the **Core ML** backend is set up correctly. +3. Run the example script to validate that the **Core ML** backend is set up correctly. ``` cd executorch # Saves add_coreml_all.pte in the current directory if successful. -python3 -m examples.apple.coreml.scripts.export_and_delegate --model_name add +python3 -m examples.apple.coreml.scripts.export --model_name add ``` @@ -66,6 +66,6 @@ coreml_backend.xcframework - Accelerate.framework - CoreML.framework - libsqlite3.tbd -``` +``` -6. The target could now run a **Core ML** delegated **Program**. +6. The target could now run a **Core ML** delegated **Program**. diff --git a/backends/apple/coreml/test/test_coreml_quantizer.py b/backends/apple/coreml/test/test_coreml_quantizer.py new file mode 100644 index 0000000000..67eee3593f --- /dev/null +++ b/backends/apple/coreml/test/test_coreml_quantizer.py @@ -0,0 +1,112 @@ +# Copyright © 2024 Apple Inc. All rights reserved. +# +# Please refer to the license found in the LICENSE file in the root directory of the source tree. + +from typing import Tuple + +import numpy as np +import pytest + +import torch + +from coremltools.optimize.torch.quantization.quantization_config import ( + LinearQuantizerConfig, + QuantizationScheme, +) + +from executorch.backends.apple.coreml.quantizer.coreml_quantizer import CoreMLQuantizer +from torch._export import capture_pre_autograd_graph +from torch.ao.quantization.quantize_pt2e import ( + convert_pt2e, + prepare_pt2e, + prepare_qat_pt2e, +) + + +class TestCoreMLQuantizer: + @staticmethod + def quantize_and_compare( + model, + example_inputs: Tuple[torch.Tensor], + quantization_type: str, + ) -> None: + assert quantization_type in {"PTQ", "QAT"} + + pre_autograd_aten_dialect = capture_pre_autograd_graph(model, example_inputs) + + quantization_config = LinearQuantizerConfig.from_dict( + { + "global_config": { + "quantization_scheme": QuantizationScheme.symmetric, + "milestones": [0, 0, 10, 10], + "activation_dtype": torch.quint8, + "weight_dtype": torch.qint8, + "weight_per_channel": True, + } + } + ) + quantizer = CoreMLQuantizer(quantization_config) + + if quantization_type == "PTQ": + prepared_graph = prepare_pt2e(pre_autograd_aten_dialect, quantizer) + elif quantization_type == "QAT": + prepared_graph = prepare_qat_pt2e(pre_autograd_aten_dialect, quantizer) + + prepared_graph(*example_inputs) + converted_graph = convert_pt2e(prepared_graph) + + model_output = model(*example_inputs).detach().numpy() + quantized_output = converted_graph(*example_inputs).detach().numpy() + np.testing.assert_allclose(quantized_output, model_output, rtol=5e-2, atol=5e-2) + + @pytest.mark.parametrize("quantization_type", ("PTQ", "QAT")) + def test_conv_relu(self, quantization_type): + SHAPE = (1, 3, 256, 256) + + class Model(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.conv = 
torch.nn.Conv2d( + in_channels=3, out_channels=16, kernel_size=3, padding=1 + ) + self.relu = torch.nn.ReLU() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + a = self.conv(x) + return self.relu(a) + + model = Model() + + example_inputs = (torch.randn(SHAPE),) + self.quantize_and_compare( + model, + example_inputs, + quantization_type, + ) + + @pytest.mark.parametrize("quantization_type", ("PTQ", "QAT")) + def test_linear(self, quantization_type): + SHAPE = (1, 5) + + class Model(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.linear = torch.nn.Linear(5, 10) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.linear(x) + + model = Model() + + example_inputs = (torch.randn(SHAPE),) + self.quantize_and_compare( + model, + example_inputs, + quantization_type, + ) + + +if __name__ == "__main__": + test_runner = TestCoreMLQuantizer() + test_runner.test_conv_relu("PTQ") + test_runner.test_linear("QAT") diff --git a/backends/arm/arm_quantizer.py b/backends/arm/arm_quantizer.py new file mode 100644 index 0000000000..1062c62428 --- /dev/null +++ b/backends/arm/arm_quantizer.py @@ -0,0 +1,453 @@ +from __future__ import annotations + +import copy +import functools + +from typing import Any, Callable, Dict, List, Optional, Set + +import torch +import torch._dynamo as torchdynamo +import torch.nn.functional as F +from torch.ao.quantization.fake_quantize import ( + FakeQuantize, + FusedMovingAvgObsFakeQuantize, +) +from torch.ao.quantization.observer import ( + HistogramObserver, + MinMaxObserver, + MovingAverageMinMaxObserver, + MovingAveragePerChannelMinMaxObserver, + PerChannelMinMaxObserver, + PlaceholderObserver, +) + +from torch.ao.quantization.qconfig import _ObserverOrFakeQuantizeConstructor + +from torch.ao.quantization.quantizer import QuantizationSpec, Quantizer + +from torch.ao.quantization.quantizer.xnnpack_quantizer_utils import ( + _convert_scalars_to_attrs, + OP_TO_ANNOTATOR, + OperatorConfig, + OperatorPatternType, + propagate_annotation, + QuantizationConfig, +) + +from torch.fx import Node + + +__all__ = [ + "XNNPACKQuantizer", + "get_symmetric_quantization_config", +] + + +def _get_dynamo_graph(function: Callable, inputs) -> torch.fx.Graph: + gm, _ = torchdynamo.export(function, aten_graph=True)(*inputs) + gm.graph.eliminate_dead_code() + return gm.graph + + +def _get_linear_patterns(input_size: List[int]): + in_channels = input_size[-1] + out_channels = 8 # hard coding but this should not matter + weight = torch.ones((out_channels, in_channels)) + bias = torch.ones((out_channels,)) + act = torch.ones(input_size) + + def linear_op(act, weight, bias=None): + return F.linear(act, weight, bias) + + pattern_w_bias = _get_dynamo_graph(linear_op, (act, weight, bias)) + pattern_wo_bias = _get_dynamo_graph(linear_op, (act, weight)) + return [pattern_w_bias, pattern_wo_bias] + + +def _supported_symmetric_quantized_operators() -> Dict[str, List[OperatorPatternType]]: + supported_operators: Dict[str, List[OperatorPatternType]] = { + # Both conv and linear should be able to handle relu + hardtanh fusion since + # those are clamp ops + "conv2d": [ + [torch.nn.Conv2d, torch.nn.ReLU], + [torch.nn.Conv2d, F.relu], + [F.conv2d, torch.nn.ReLU], + [F.conv2d, F.relu], + ], + "linear": [[torch.nn.Linear], [F.linear]], + "add": [[torch.add]], + "max_pool2d": [[torch.nn.MaxPool2d], [F.max_pool2d]], + "adaptive_avg_pool2d": [ + [torch.nn.AdaptiveAvgPool2d], + [F.adaptive_avg_pool2d], + ], + } + return copy.deepcopy(supported_operators) + + +def 
_get_supported_symmetric_config_and_operators() -> List[OperatorConfig]: + supported_config_and_operators: List[OperatorConfig] = [] + for quantization_config in [ + get_symmetric_quantization_config(), + get_symmetric_quantization_config(is_qat=True), + get_symmetric_quantization_config(is_per_channel=True), + get_symmetric_quantization_config(is_per_channel=True, is_qat=True), + ]: + ops = _supported_symmetric_quantized_operators() + for pattern_list in ops.values(): + supported_config_and_operators.append( + OperatorConfig(quantization_config, pattern_list) + ) + return copy.deepcopy(supported_config_and_operators) + + +@functools.lru_cache +def get_symmetric_quantization_config( + is_per_channel: bool = False, + is_qat: bool = False, + is_dynamic: bool = False, + act_qmin: int = -128, + act_qmax: int = 127, + weight_qmin: int = -127, + weight_qmax: int = 127, +): + extra_args: Dict[str, Any] = {"eps": 2**-12} + if is_qat: + if is_dynamic: + act_observer_or_fake_quant_ctr = FakeQuantize + dynamic_quant_observer = MovingAverageMinMaxObserver.with_args( + averaging_constant=1 + ) + extra_args["observer"] = dynamic_quant_observer + else: + act_observer_or_fake_quant_ctr = FusedMovingAvgObsFakeQuantize # type: ignore[assignment] + else: + if is_dynamic: + act_observer_or_fake_quant_ctr = PlaceholderObserver # type: ignore[assignment] + else: + act_observer_or_fake_quant_ctr = HistogramObserver # type: ignore[assignment] + + act_quantization_spec = QuantizationSpec( + dtype=torch.int8, + quant_min=act_qmin, + quant_max=act_qmax, + qscheme=torch.per_tensor_affine, + is_dynamic=is_dynamic, + observer_or_fake_quant_ctr=act_observer_or_fake_quant_ctr.with_args( + **extra_args, + ), + ) + weight_qscheme = ( + torch.per_channel_symmetric if is_per_channel else torch.per_tensor_symmetric + ) + weight_observer_or_fake_quant_ctr: _ObserverOrFakeQuantizeConstructor = ( + MinMaxObserver + ) + if is_qat: + # TODO: qat + per channel? + weight_observer_or_fake_quant_ctr = FusedMovingAvgObsFakeQuantize + elif is_per_channel: + weight_observer_or_fake_quant_ctr = PerChannelMinMaxObserver + + extra_args: Dict[str, Any] = {"eps": 2**-12} + if is_qat: + if weight_qscheme == torch.per_tensor_symmetric: + extra_args["observer"] = MovingAverageMinMaxObserver + else: + extra_args["observer"] = MovingAveragePerChannelMinMaxObserver # type: ignore[dict-item] + weight_quantization_spec = QuantizationSpec( + dtype=torch.int8, + quant_min=weight_qmin, + quant_max=weight_qmax, + qscheme=weight_qscheme, + ch_axis=0, + is_dynamic=False, + observer_or_fake_quant_ctr=weight_observer_or_fake_quant_ctr.with_args( + **extra_args + ), + ) + + bias_quantization_spec = None + if is_dynamic: + quantization_config = QuantizationConfig( + act_quantization_spec, + None, + weight_quantization_spec, + bias_quantization_spec, + is_qat, + ) + else: + quantization_config = QuantizationConfig( + act_quantization_spec, + act_quantization_spec, + weight_quantization_spec, + bias_quantization_spec, + is_qat, + ) + return quantization_config + + +def _get_supported_config_and_operators() -> List[OperatorConfig]: + return _get_supported_symmetric_config_and_operators() + + +def _get_module_name_filter(module_name: str): + """Get the module_name_filter function for a given module name, the filter accepts + a node and checks if the node comes from a module that has certain module name + + For example: + node: linear_op = call_function[...](...) 
# comes from a module with name blocks.sub.linear1 + + + >> module_name_filter = _get_module_name_filter("blocks.sub") + >> print(module_name_filter(node)) + True # the node is from "blocks.sub" based on the fully qualified name "blocks.sub.linear1" + """ + + def module_name_filter(n: Node) -> bool: + # example: { + # 'L__self___sub': ("L['self'].sub", ), + # 'L__self___sub_linear': ("L['self'].sub.linear", ) + # } + # get_attr nodes doesn't have nn_module_stack? + nn_module_stack = n.meta.get("nn_module_stack", {}) + names = [n[len("L['self'].") :] for n, klass in nn_module_stack.values()] + return module_name in names + + return module_name_filter + + +def _get_module_type_filter(tp: Callable): + """Get the module_type_filter function for a given module type, the filter accepts + a node and checks if the node comes from a module that has certain module type + + For example: + node: linear_op = call_function[...](...) # comes from a module with type Block -> Sub -> Linear + + + >> module_type_filter = _get_module_type_filter(Sub) # submodule with type `Sub`, under the `Block` submodule + >> print(module_type_filter(node)) + True # the node is from the submodule `Sub` (same for `Block` and `Linear` as well) + """ + + def module_type_filter(n: Node) -> bool: + # example: { + # 'L__self___sub': ("L['self'].sub", ), + # 'L__self___sub_linear': ("L['self'].sub.linear", ) + # } + nn_module_stack = n.meta.get("nn_module_stack", {}) + types = [t for _, t in nn_module_stack.values()] + return tp in types + + return module_type_filter + + +def _get_not_module_type_or_name_filter( + tp_list: List[Callable], module_name_list: List[str] +) -> Callable[[Node], bool]: + module_type_filters = [_get_module_type_filter(tp) for tp in tp_list] + module_name_list_filters = [_get_module_name_filter(m) for m in module_name_list] + + def not_module_type_or_name_filter(n: Node) -> bool: + return not any(f(n) for f in module_type_filters + module_name_list_filters) + + return not_module_type_or_name_filter + + +class XNNPACKQuantizer(Quantizer): + supported_config_and_operators = _get_supported_config_and_operators() + STATIC_QAT_ONLY_OPS = [ + "conv_bn_relu", + "conv_bn", + ] + + # static quantization ops (both PTQ and QAT) + # Preserve the order that fusions come before singular ops + STATIC_OPS = [ + "linear_relu", + "linear", + "conv_relu", + "conv", + "adaptive_avg_pool2d", + # TODO: move this to BoltNNQuantizer? 
+ "gru_io_only", + "max_pool2d", + "add_relu", + "add", + "mul_relu", + "mul", + "cat", + ] + + DYNAMIC_OPS = [ + "linear", + ] + + def __init__(self): + super().__init__() + self.global_config: Optional[QuantizationConfig] = None + self.operator_type_config: Dict[ + torch._ops.OpOverloadPacket, Optional[QuantizationConfig] + ] = {} + self.module_type_config: Dict[Callable, Optional[QuantizationConfig]] = {} + self.module_name_config: Dict[str, Optional[QuantizationConfig]] = {} + + @classmethod + def get_supported_quantization_configs(cls) -> List[QuantizationConfig]: + op_configs: Set[QuantizationConfig] = set({}) + for spec, _ in cls.supported_config_and_operators: + op_configs.add(spec) + return list(op_configs) + + @classmethod + def get_supported_operator_for_quantization_config( + cls, quantization_config: Optional[QuantizationConfig] + ) -> List[OperatorPatternType]: + if quantization_config is None: + all_ops = [] + for _, ops in cls.supported_config_and_operators: + all_ops.extend(ops) + return all_ops + + for config, ops in cls.supported_config_and_operators: + # note: this assumes each entry in cls.supported_spec_and_operators + # corresponds to one spec, e.g. we don't have + # [(spec1, op_list1), (spec1, op_list2), (spec2, op_list3)] + # where the first and second entry have the same spec but did not + # merge the op list + if config == quantization_config: + return ops + return [] + + def set_global(self, quantization_config: QuantizationConfig) -> XNNPACKQuantizer: + self.global_config = quantization_config + return self + + def set_operator_type( + self, + operator_type: torch._ops.OpOverloadPacket, + quantization_config: QuantizationConfig, + ) -> XNNPACKQuantizer: + self.operator_type_config[operator_type] = quantization_config + return self + + def set_module_type( + self, module_type: Callable, quantization_config: QuantizationConfig + ): + """Set quantization_config for a submodule with type: `module_type`, for example: + quantizer.set_module_name(Sub) or quantizer.set_module_name(nn.Linear), it will quantize all supported operator/operator + patterns in the submodule with this module type with the given `quantization_config` + """ + self.module_type_config[module_type] = quantization_config + return self + + def set_module_name( + self, module_name: str, quantization_config: Optional[QuantizationConfig] + ): + """Set quantization_config for a submodule with name: `module_name`, for example: + quantizer.set_module_name("blocks.sub"), it will quantize all supported operator/operator + patterns in the submodule with this module name with the given `quantization_config` + """ + assert ( + quantization_config is not None + ), " quantization_config == None is not supported yet" + self.module_name_config[module_name] = quantization_config + return self + + def transform_for_annotation( + self, model: torch.fx.GraphModule + ) -> torch.fx.GraphModule: + """Transforms scalar values to tensor attributes""" + return _convert_scalars_to_attrs(model) + + def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule: + """just handling global spec for now""" + # hacked for handling dynamic linear quant. will fix later. 
+ if self.global_config and self.global_config.input_activation.is_dynamic: # type: ignore[union-attr] + model = self._annotate_for_dynamic_quantization_config(model) + else: + model = self._annotate_for_static_quantization_config(model) + propagate_annotation(model) + return model + + def _annotate_all_static_patterns( + self, + model: torch.fx.GraphModule, + quantization_config: Optional[QuantizationConfig], + filter_fn: Optional[Callable[[Node], bool]] = None, + ) -> torch.fx.GraphModule: + # TODO: implement the support for None to be canceling out previous annotations + if quantization_config is None: + return model + + if quantization_config.is_qat: + for op in self.STATIC_QAT_ONLY_OPS: + OP_TO_ANNOTATOR[op](model, quantization_config, filter_fn) + for op in self.STATIC_OPS: + OP_TO_ANNOTATOR[op](model, quantization_config, filter_fn) + return model + + def _annotate_all_dynamic_patterns( + self, + model: torch.fx.GraphModule, + quantization_config: Optional[QuantizationConfig], + filter_fn: Optional[Callable[[Node], bool]] = None, + ) -> torch.fx.GraphModule: + # TODO: implement the support for None to be canceling out previous annotations + if quantization_config is None: + return model + + for op in self.DYNAMIC_OPS: + OP_TO_ANNOTATOR[op](model, quantization_config, filter_fn) + return model + + def _annotate_for_static_quantization_config( + self, model: torch.fx.GraphModule + ) -> torch.fx.GraphModule: + module_name_list = list(self.module_name_config.keys()) + for module_name, config in self.module_name_config.items(): + self._annotate_all_static_patterns( + model, config, _get_module_name_filter(module_name) + ) + + tp_list = list(self.module_type_config.keys()) + for module_type, config in self.module_type_config.items(): + self._annotate_all_static_patterns( + model, config, _get_module_type_filter(module_type) + ) + + self._annotate_all_static_patterns( + model, + self.global_config, + _get_not_module_type_or_name_filter(tp_list, module_name_list), + ) + return model + + def _annotate_for_dynamic_quantization_config( + self, model: torch.fx.GraphModule + ) -> torch.fx.GraphModule: + module_name_list = list(self.module_name_config.keys()) + for module_name, config in self.module_name_config.items(): + self._annotate_all_dynamic_patterns( + model, config, _get_module_name_filter(module_name) + ) + + tp_list = list(self.module_type_config.keys()) + for module_type, config in self.module_type_config.items(): + self._annotate_all_dynamic_patterns( + model, config, _get_module_type_filter(module_type) + ) + + self._annotate_all_dynamic_patterns( + model, + self.global_config, + _get_not_module_type_or_name_filter(tp_list, module_name_list), + ) + return model + + def validate(self, model: torch.fx.GraphModule) -> None: + pass + + @classmethod + def get_supported_operators(cls) -> List[OperatorConfig]: + return cls.supported_config_and_operators diff --git a/backends/arm/arm_quantizer_utils.py b/backends/arm/arm_quantizer_utils.py new file mode 100644 index 0000000000..7316e0fbad --- /dev/null +++ b/backends/arm/arm_quantizer_utils.py @@ -0,0 +1,1031 @@ +import itertools +import operator +from dataclasses import dataclass +from typing import Callable, Dict, List, NamedTuple, Optional + +import torch +import torch.nn.functional as F +from torch._subclasses import FakeTensor +from torch.ao.quantization.fx.utils import get_new_attr_name_with_prefix +from torch.ao.quantization.pt2e.export_utils import _WrapperModule +from torch.ao.quantization.pt2e.graph_utils import 
find_sequential_partitions +from torch.ao.quantization.pt2e.utils import ( + _conv1d_bn_example_inputs, + _conv2d_bn_example_inputs, + get_aten_graph_module, +) +from torch.ao.quantization.quantizer import ( + QuantizationAnnotation, + QuantizationSpec, + SharedQuantizationSpec, +) + +from torch.ao.quantization.quantizer.utils import ( + _annotate_input_qspec_map, + _annotate_output_qspec, +) +from torch.fx import Node +from torch.fx.passes.utils.matcher_with_name_node_map_utils import ( + SubgraphMatcherWithNameNodeMap, +) +from torch.fx.passes.utils.source_matcher_utils import get_source_partitions + + +__all__ = [ + "OperatorConfig", + "OperatorPatternType", + "QuantizationConfig", + "get_input_act_qspec", + "get_output_act_qspec", + "get_weight_qspec", + "get_bias_qspec", + "OP_TO_ANNOTATOR", + "propagate_annotation", +] + + +# In the absence of better name, just winging it with QuantizationConfig +@dataclass(eq=True, frozen=True) +class QuantizationConfig: + input_activation: Optional[QuantizationSpec] + output_activation: Optional[QuantizationSpec] + weight: Optional[QuantizationSpec] + bias: Optional[QuantizationSpec] + # TODO: remove, since we can use observer_or_fake_quant_ctr to express this + is_qat: bool = False + + +OperatorPatternType = List[Callable] +OperatorPatternType.__module__ = ( + "torch.ao.quantization.quantizer.xnnpack_quantizer_utils" +) + +AnnotatorType = Callable[ + [ + torch.fx.GraphModule, + Optional[QuantizationConfig], + Optional[Callable[[Node], bool]], + ], + Optional[List[List[Node]]], +] +OP_TO_ANNOTATOR: Dict[str, AnnotatorType] = {} + + +def register_annotator(op: str): + def decorator(annotator: AnnotatorType): + OP_TO_ANNOTATOR[op] = annotator + + return decorator + + +class OperatorConfig(NamedTuple): + # fix List[str] with List[List[Union[nn.Module, FunctionType, BuiltinFunctionType]]] + # Basically we are mapping a quantization config to some list of patterns. + # a pattern is defined as a list of nn module, function or builtin function names + # e.g. [nn.Conv2d, torch.relu, torch.add] + # We have not resolved whether fusion can be considered internal details of the + # quantizer hence it does not need communication to user. + # Note this pattern is not really informative since it does not really + # tell us the graph structure resulting from the list of ops. 
+ config: QuantizationConfig + operators: List[OperatorPatternType] + + +def _is_annotated(nodes: List[Node]): + """ + Given a list of nodes (that represents an operator pattern), + check if any of the node is annotated, return True if any of the node + is annotated, otherwise return False + """ + annotated = False + for node in nodes: + annotated = annotated or ( + "quantization_annotation" in node.meta + and node.meta["quantization_annotation"]._annotated + ) + return annotated + + +def _mark_nodes_as_annotated(nodes: List[Node]): + for node in nodes: + if node is not None: + if "quantization_annotation" not in node.meta: + node.meta["quantization_annotation"] = QuantizationAnnotation() + node.meta["quantization_annotation"]._annotated = True + + +def get_input_act_qspec(quantization_config: Optional[QuantizationConfig]): + if quantization_config is None: + return None + if quantization_config.input_activation is None: + return None + quantization_spec: QuantizationSpec = quantization_config.input_activation + assert quantization_spec.qscheme in [ + torch.per_tensor_affine, + torch.per_tensor_symmetric, + ] + return quantization_spec + + +def get_output_act_qspec(quantization_config: Optional[QuantizationConfig]): + if quantization_config is None: + return None + if quantization_config.output_activation is None: + return None + quantization_spec: QuantizationSpec = quantization_config.output_activation + assert quantization_spec.qscheme in [ + torch.per_tensor_affine, + torch.per_tensor_symmetric, + ] + return quantization_spec + + +def get_weight_qspec(quantization_config: Optional[QuantizationConfig]): + if quantization_config is None: + return None + assert quantization_config is not None + if quantization_config.weight is None: + return None + quantization_spec: QuantizationSpec = quantization_config.weight + if quantization_spec.qscheme not in [ + torch.per_tensor_symmetric, + torch.per_channel_symmetric, + ]: + raise ValueError( + f"Unsupported quantization_spec {quantization_spec} for weight" + ) + return quantization_spec + + +def get_bias_qspec(quantization_config: Optional[QuantizationConfig]): + if quantization_config is None: + return None + assert quantization_config is not None + if quantization_config.bias is None: + return None + quantization_spec: QuantizationSpec = quantization_config.bias + assert ( + quantization_spec.dtype == torch.float + ), "Only float dtype for bias is supported for bias right now" + return quantization_spec + + +@register_annotator("linear") +def _annotate_linear( + gm: torch.fx.GraphModule, + quantization_config: Optional[QuantizationConfig], + filter_fn: Optional[Callable[[Node], bool]] = None, +) -> Optional[List[List[Node]]]: + annotated_partitions = [] + input_act_qspec = get_input_act_qspec(quantization_config) + output_act_qspec = get_output_act_qspec(quantization_config) + weight_qspec = get_weight_qspec(quantization_config) + bias_qspec = get_bias_qspec(quantization_config) + for node in gm.graph.nodes: + if node.op != "call_function" or node.target != torch.ops.aten.linear.default: + continue + if filter_fn and not filter_fn(node): + continue + act_node = node.args[0] + weight_node = node.args[1] + bias_node = None + if len(node.args) > 2: + bias_node = node.args[2] + + if _is_annotated([node]) is False: # type: ignore[list-item] + _annotate_input_qspec_map( + node, + act_node, + input_act_qspec, + ) + _annotate_input_qspec_map( + node, + weight_node, + weight_qspec, + ) + nodes_to_mark_annotated = [node, weight_node] + if bias_node: + 
_annotate_input_qspec_map( + node, + bias_node, + bias_qspec, + ) + nodes_to_mark_annotated.append(bias_node) + _annotate_output_qspec(node, output_act_qspec) + _mark_nodes_as_annotated(nodes_to_mark_annotated) + annotated_partitions.append(nodes_to_mark_annotated) + + return annotated_partitions + + +@register_annotator("linear_relu") +def _annotate_linear_relu( + gm: torch.fx.GraphModule, + quantization_config: Optional[QuantizationConfig], + filter_fn: Optional[Callable[[Node], bool]] = None, +) -> Optional[List[List[Node]]]: + annotated_partitions = [] + input_act_qspec = get_input_act_qspec(quantization_config) + output_act_qspec = get_output_act_qspec(quantization_config) + weight_qspec = get_weight_qspec(quantization_config) + bias_qspec = get_bias_qspec(quantization_config) + for node in gm.graph.nodes: + if node.op != "call_function" or node.target not in [ + torch.ops.aten.relu.default, + torch.ops.aten.relu_.default, + ]: + continue + relu_node = node + maybe_linear_node = node.args[0] + if ( + not isinstance(maybe_linear_node, Node) + or maybe_linear_node.op != "call_function" + or maybe_linear_node.target != torch.ops.aten.linear.default + ): + continue + + linear_node = maybe_linear_node + input_qspec_map = {} + input_act = linear_node.args[0] + assert isinstance(input_act, Node) + input_qspec_map[input_act] = input_act_qspec + + weight = linear_node.args[1] + assert isinstance(weight, Node) + input_qspec_map[weight] = weight_qspec + + # adding weight node to the partition as well + partition = [relu_node, linear_node, weight] + bias = linear_node.args[2] if len(linear_node.args) > 2 else None + if isinstance(bias, Node): + input_qspec_map[bias] = bias_qspec + partition.append(bias) + + if _is_annotated(partition): + continue + + if filter_fn and any(not filter_fn(n) for n in partition): + continue + + linear_node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + _annotated=True, + ) + relu_node.meta["quantization_annotation"] = QuantizationAnnotation( + output_qspec=output_act_qspec, + _annotated=True, + ) + _mark_nodes_as_annotated(partition) + annotated_partitions.append(partition) + return annotated_partitions + + +@register_annotator("conv") +def _annotate_conv( + gm: torch.fx.GraphModule, + quantization_config: Optional[QuantizationConfig], + filter_fn: Optional[Callable[[Node], bool]] = None, +) -> Optional[List[List[Node]]]: + annotated_partitions = [] + for n in gm.graph.nodes: + if n.op != "call_function" or n.target not in [ + torch.ops.aten.conv1d.default, + torch.ops.aten.conv2d.default, + ]: + continue + conv_node = n + + input_qspec_map = {} + input_act = conv_node.args[0] + assert isinstance(input_act, Node) + input_qspec_map[input_act] = get_input_act_qspec(quantization_config) + + weight = conv_node.args[1] + assert isinstance(weight, Node) + input_qspec_map[weight] = get_weight_qspec(quantization_config) + + # adding weight node to the partition as well + partition = [conv_node, conv_node.args[1]] + + bias = conv_node.args[2] if len(conv_node.args) > 2 else None + if isinstance(bias, Node): + input_qspec_map[bias] = get_bias_qspec(quantization_config) + partition.append(bias) + + if _is_annotated(partition): + continue + + if filter_fn and any(not filter_fn(n) for n in partition): + continue + + conv_node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=get_output_act_qspec(quantization_config), + _annotated=True, + ) + _mark_nodes_as_annotated(partition) 
+ annotated_partitions.append(partition) + return annotated_partitions + + +@register_annotator("conv_relu") +def _annotate_conv_relu( + gm: torch.fx.GraphModule, + quantization_config: Optional[QuantizationConfig], + filter_fn: Optional[Callable[[Node], bool]] = None, +) -> Optional[List[List[Node]]]: + annotated_partitions = [] + for n in gm.graph.nodes: + if n.op != "call_function" or n.target not in [ + torch.ops.aten.relu.default, + torch.ops.aten.relu_.default, + ]: + continue + relu_node = n + maybe_conv_node = n.args[0] + if ( + not isinstance(maybe_conv_node, Node) + or maybe_conv_node.op != "call_function" + or maybe_conv_node.target + not in [ + torch.ops.aten.conv1d.default, + torch.ops.aten.conv2d.default, + ] + ): + continue + conv_node = maybe_conv_node + + input_qspec_map = {} + input_act = conv_node.args[0] + assert isinstance(input_act, Node) + input_qspec_map[input_act] = get_input_act_qspec(quantization_config) + + weight = conv_node.args[1] + assert isinstance(weight, Node) + input_qspec_map[weight] = get_weight_qspec(quantization_config) + + # adding weight node to the partition as well + partition = [relu_node, conv_node, conv_node.args[1]] + bias = conv_node.args[2] if len(conv_node.args) > 2 else None + if isinstance(bias, Node): + input_qspec_map[bias] = get_bias_qspec(quantization_config) + partition.append(bias) + + if _is_annotated(partition): + continue + + if filter_fn and any(not filter_fn(n) for n in partition): + continue + + conv_node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, _annotated=True + ) + relu_node.meta["quantization_annotation"] = QuantizationAnnotation( + output_qspec=get_output_act_qspec(quantization_config), # type: ignore[arg-type] + _annotated=True, + ) + _mark_nodes_as_annotated(partition) + annotated_partitions.append(partition) + return annotated_partitions + + +@register_annotator("conv_bn") +def _annotate_conv_bn( + gm: torch.fx.GraphModule, + quantization_config: Optional[QuantizationConfig], + filter_fn: Optional[Callable[[Node], bool]] = None, +) -> Optional[List[List[Node]]]: + """ + Find conv + batchnorm parititions + Note: This is only used for QAT. In PTQ, batchnorm should already be fused into the conv. + """ + return _do_annotate_conv_bn(gm, quantization_config, filter_fn, has_relu=False) + + +@register_annotator("conv_bn_relu") +def _annotate_conv_bn_relu( + gm: torch.fx.GraphModule, + quantization_config: Optional[QuantizationConfig], + filter_fn: Optional[Callable[[Node], bool]] = None, +) -> Optional[List[List[Node]]]: + """ + Find conv + batchnorm + relu parititions + Note: This is only used for QAT. In PTQ, batchnorm should already be fused into the conv. 
+ """ + return _do_annotate_conv_bn(gm, quantization_config, filter_fn, has_relu=True) + + +def _get_pattern(conv_fn: Callable, relu_is_inplace: bool, has_relu: bool): + def _conv_bn(x, conv_weight, conv_bias, bn_weight, bn_bias, bn_rm, bn_rv): + conv = conv_fn(x, conv_weight, conv_bias) + bn = F.batch_norm(conv, bn_rm, bn_rv, bn_weight, bn_bias, training=True) + if has_relu: + output = F.relu_(bn) if relu_is_inplace else F.relu(bn) + else: + output = bn + return output, { + "input": x, + "conv": conv, + "weight": conv_weight, + "bias": conv_bias, + "output": output, + } + + return _WrapperModule(_conv_bn) + + +def _do_annotate_conv_bn( + gm: torch.fx.GraphModule, + quantization_config: Optional[QuantizationConfig], + filter_fn: Optional[Callable[[Node], bool]], + has_relu: bool, +) -> List[List[Node]]: + """ + Given a function that takes in a `conv_fn` and returns a conv-bn[-relu] pattern, + return a list of annotated partitions. + + The output of the pattern must include a dictionary from string name to node + for the following names: "input", "conv", "weight", "bias", and "output". + """ + + # Needed for matching, otherwise the matches gets filtered out due to unused + # nodes returned by batch norm + gm.graph.eliminate_dead_code() + gm.recompile() + + matches = [] + combinations = [ + (F.conv1d, _conv1d_bn_example_inputs), + (F.conv2d, _conv2d_bn_example_inputs), + ] + + # Add `is_cuda` and `relu_is_inplace` dimensions + combinations = itertools.product( + combinations, + [True, False] if torch.cuda.is_available() else [False], # is_cuda + [True, False] if has_relu else [False], # relu_is_inplace + ) + + # Match against all conv dimensions and cuda variants + for (conv_fn, example_inputs), is_cuda, relu_is_inplace in combinations: + pattern = _get_pattern(conv_fn, relu_is_inplace, has_relu) + pattern = get_aten_graph_module(pattern, example_inputs, is_cuda) + pattern.graph.eliminate_dead_code() + pattern.recompile() + matcher = SubgraphMatcherWithNameNodeMap(pattern, ignore_literals=True) + matches.extend(matcher.match(gm.graph)) + + # Annotate nodes returned in the matches + annotated_partitions = [] + for match in matches: + name_node_map = match.name_node_map + input_node = name_node_map["input"] + conv_node = name_node_map["conv"] + weight_node = name_node_map["weight"] + bias_node = name_node_map["bias"] + output_node = name_node_map["output"] + + # TODO: annotate the uses of input, weight, and bias separately instead + # of assuming they come from a single conv node. This is not possible today + # because input may have multiple users, and we can't rely on the conv node + # always being the first user. 
This was the case in models with skip + # connections like resnet18 + + # Validate conv args + if conv_node.args[0] is not input_node: + raise ValueError("Conv arg did not contain input node ", input_node) + if conv_node.args[1] is not weight_node: + raise ValueError("Conv arg did not contain weight node ", weight_node) + if len(conv_node.args) > 2 and conv_node.args[2] is not bias_node: + raise ValueError("Conv arg did not contain bias node ", bias_node) + + # Skip if the partition is already annotated or is filtered out by the user + partition = [conv_node, weight_node] + if bias_node is not None: + partition.append(bias_node) + if _is_annotated(partition): + continue + if filter_fn and any(not filter_fn(n) for n in partition): + continue + + # Annotate conv inputs and pattern output + input_qspec_map = {} + input_qspec_map[input_node] = get_input_act_qspec(quantization_config) + input_qspec_map[weight_node] = get_weight_qspec(quantization_config) + if bias_node is not None: + input_qspec_map[bias_node] = get_bias_qspec(quantization_config) + conv_node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + _annotated=True, + ) + output_node.meta["quantization_annotation"] = QuantizationAnnotation( + output_qspec=get_output_act_qspec(quantization_config), # type: ignore[arg-type] + _annotated=True, + ) + _mark_nodes_as_annotated(partition) + annotated_partitions.append(partition) + return annotated_partitions + + +@register_annotator("gru_io_only") +def _annotate_gru_io_only( + gm: torch.fx.GraphModule, + quantization_config: Optional[QuantizationConfig], + filter_fn: Optional[Callable[[Node], bool]] = None, +) -> Optional[List[List[Node]]]: + gru_partitions = get_source_partitions(gm.graph, [torch.nn.GRU], filter_fn) + gru_partitions = list(itertools.chain.from_iterable(gru_partitions.values())) + annotated_partitions = [] + for gru_partition in gru_partitions: + annotated_partitions.append(gru_partition.nodes) + output_nodes = gru_partition.output_nodes + input_nodes = gru_partition.input_nodes + # skip annotation if it is already annotated + if _is_annotated(input_nodes + output_nodes): + continue + # inside each GRU partition, we should be able to annotate each linear + # subgraph + input_act = input_nodes[0] + input_act_user = next(iter(input_act.users.keys())) + assert isinstance(input_act, Node) + assert isinstance(input_act_user, Node) + input_act_user.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map={ + input_act: get_input_act_qspec(quantization_config), + }, + _annotated=True, + ) + + hidden_state = input_nodes[1] + hidden_state_user = next(iter(hidden_state.users.keys())) + assert isinstance(hidden_state, Node) + assert isinstance(hidden_state_user, Node) + hidden_state_user.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map={ + hidden_state: get_input_act_qspec(quantization_config), + }, + _annotated=True, + ) + + assert len(output_nodes) == 2, "expecting GRU to have two outputs" + for output in output_nodes: + output.meta["quantization_annotation"] = QuantizationAnnotation( + output_qspec=get_output_act_qspec(quantization_config), + _annotated=True, + ) + nodes_to_mark_annotated = list(gru_partition.nodes) + _mark_nodes_as_annotated(nodes_to_mark_annotated) + return annotated_partitions + + +@register_annotator("max_pool2d") +def _annotate_max_pool2d( + gm: torch.fx.GraphModule, + quantization_config: Optional[QuantizationConfig], + filter_fn: Optional[Callable[[Node], bool]] = None, 
+) -> Optional[List[List[Node]]]: + module_partitions = get_source_partitions( + gm.graph, [torch.nn.MaxPool2d, torch.nn.functional.max_pool2d], filter_fn + ) + maxpool_partitions = list(itertools.chain.from_iterable(module_partitions.values())) + annotated_partitions = [] + for maxpool_partition in maxpool_partitions: + annotated_partitions.append(maxpool_partition.nodes) + output_node = maxpool_partition.output_nodes[0] + maxpool_node = None + for n in maxpool_partition.nodes: + if n.target == torch.ops.aten.max_pool2d.default: + maxpool_node = n + assert ( + maxpool_node is not None + ), "XNNPACKQuantizer only works with torch.ops.aten.max_pool2d.default, " + "please make sure you are exporting the model correctly" + if _is_annotated([output_node, maxpool_node]): # type: ignore[list-item] + continue + + input_act = maxpool_node.args[0] # type: ignore[union-attr] + assert isinstance(input_act, Node) + + # only annotate maxpool when the output of the input node is annotated + if ( + "quantization_annotation" not in input_act.meta + or not input_act.meta["quantization_annotation"]._annotated + or input_act.meta["quantization_annotation"].output_qspec is None + ): + continue + # input and output of maxpool will share quantization parameter with input of maxpool + act_qspec = SharedQuantizationSpec(input_act) + # act_qspec = get_act_qspec(quantization_config) + maxpool_node.meta["quantization_annotation"] = QuantizationAnnotation( # type: ignore[union-attr] + input_qspec_map={ + input_act: act_qspec, + }, + _annotated=True, + ) + output_node.meta["quantization_annotation"] = QuantizationAnnotation( + output_qspec=act_qspec, + _annotated=True, + ) + return annotated_partitions + + +@register_annotator("adaptive_avg_pool2d") +def _annotate_adaptive_avg_pool2d( + gm: torch.fx.GraphModule, + quantization_config: Optional[QuantizationConfig], + filter_fn: Optional[Callable[[Node], bool]] = None, +) -> Optional[List[List[Node]]]: + """Always annotate adaptive_avg_pool2d op""" + module_partitions = get_source_partitions( + gm.graph, [torch.nn.AdaptiveAvgPool2d, F.adaptive_avg_pool2d], filter_fn + ) + partitions = list(itertools.chain.from_iterable(module_partitions.values())) + annotated_partitions = [] + for partition in partitions: + pool_node = partition.output_nodes[0] + if ( + pool_node.op != "call_function" + or pool_node.target != torch.ops.aten.adaptive_avg_pool2d.default + ): + raise ValueError(f"{pool_node} is not an aten adaptive_avg_pool2d operator") + + if _is_annotated([pool_node]): + continue + + annotated_partitions.append(partition.nodes) + input_act = pool_node.args[0] + assert isinstance(input_act, Node) + + # only annotate input output sharing operator + # when the output of the input node is annotated + if ( + "quantization_annotation" not in input_act.meta + or not input_act.meta["quantization_annotation"]._annotated + or input_act.meta["quantization_annotation"].output_qspec is None + ): + input_act_qspec = get_input_act_qspec(quantization_config) + else: + input_act_qspec = SharedQuantizationSpec(input_act) + + # output sharing with input + output_act_qspec = SharedQuantizationSpec((input_act, pool_node)) + pool_node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map={ + input_act: input_act_qspec, + }, + output_qspec=output_act_qspec, + _annotated=True, + ) + return annotated_partitions + + +def _is_input_large_scalar(node: Node, gm: torch.fx.GraphModule): + """Check if input is a large scalar value. 
So that we can skip quantization for the node + since histc op (in HistogramObserver) only works for values up to certain upper bound + """ + if node.op == "get_attr": + tensor = getattr(gm, node.target) # type: ignore[arg-type] + # torch.histc works until this upper bound + HISTC_UPPER_BOUND = 3.4028235e15 + return tensor.numel() == 1 and abs(tensor.item()) > HISTC_UPPER_BOUND + return False + + +def _is_input_non_float_tensor(node: Node): + """Check if the input is not a float tensor, so that we can skip quantization for the node + since observers only works with float Tensors + """ + if "val" not in node.meta or not isinstance(node.meta["val"], FakeTensor): + return True + return node.meta["val"].dtype != torch.float32 + + +@register_annotator("add_relu") +def _annotate_add_relu( + gm: torch.fx.GraphModule, + quantization_config: Optional[QuantizationConfig], + filter_fn: Optional[Callable[[Node], bool]] = None, +) -> Optional[List[List[Node]]]: + fused_partitions = find_sequential_partitions( + gm, [torch.add, torch.nn.ReLU], filter_fn=filter_fn + ) + annotated_partitions = [] + for fused_partition in fused_partitions: + add_partition, relu_partition = fused_partition + annotated_partitions.append(add_partition.nodes + relu_partition.nodes) + if len(relu_partition.output_nodes) > 1: + raise ValueError("Relu partition has more than one output node") + relu_node = relu_partition.output_nodes[0] + if len(add_partition.output_nodes) > 1: + raise ValueError("add partition has more than one output node") + add_node = add_partition.output_nodes[0] + + if _is_annotated([relu_node, add_node]): + continue + + input_act_qspec = get_input_act_qspec(quantization_config) + output_act_qspec = get_output_act_qspec(quantization_config) + + input_qspec_map = {} + input_act0 = add_node.args[0] + if isinstance(input_act0, Node): + if _is_input_large_scalar(input_act0, gm): + continue + if _is_input_non_float_tensor(input_act0): + continue + input_qspec_map[input_act0] = input_act_qspec + + input_act1 = add_node.args[1] + if isinstance(input_act1, Node): + if _is_input_large_scalar(input_act1, gm): + continue + if _is_input_non_float_tensor(input_act1): + continue + input_qspec_map[input_act1] = input_act_qspec + + add_node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + _annotated=True, + ) + relu_node.meta["quantization_annotation"] = QuantizationAnnotation( + output_qspec=output_act_qspec, + _annotated=True, + ) + return annotated_partitions + + +@register_annotator("add") +def _annotate_add( + gm: torch.fx.GraphModule, + quantization_config: Optional[QuantizationConfig], + filter_fn: Optional[Callable[[Node], bool]] = None, +) -> Optional[List[List[Node]]]: + add_partitions = get_source_partitions( + gm.graph, [operator.add, torch.add, operator.iadd], filter_fn + ) + add_partitions = list(itertools.chain.from_iterable(add_partitions.values())) + annotated_partitions = [] + for add_partition in add_partitions: + annotated_partitions.append(add_partition.nodes) + add_node = add_partition.output_nodes[0] + if _is_annotated([add_node]): + continue + + input_act_qspec = get_input_act_qspec(quantization_config) + output_act_qspec = get_output_act_qspec(quantization_config) + + input_qspec_map = {} + input_act0 = add_node.args[0] + if isinstance(input_act0, Node): + if _is_input_large_scalar(input_act0, gm): + continue + if _is_input_non_float_tensor(input_act0): + continue + input_qspec_map[input_act0] = input_act_qspec + + input_act1 = add_node.args[1] + if 
isinstance(input_act1, Node): + if _is_input_large_scalar(input_act1, gm): + continue + if _is_input_non_float_tensor(input_act1): + continue + input_qspec_map[input_act1] = input_act_qspec + + add_node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=output_act_qspec, + _annotated=True, + ) + return annotated_partitions + + +@register_annotator("mul_relu") +def _annotate_mul_relu( + gm: torch.fx.GraphModule, + quantization_config: Optional[QuantizationConfig], + filter_fn: Optional[Callable[[Node], bool]] = None, +) -> Optional[List[List[Node]]]: + fused_partitions = find_sequential_partitions( + gm, [torch.mul, torch.nn.ReLU], filter_fn=filter_fn + ) + annotated_partitions = [] + for fused_partition in fused_partitions: + mul_partition, relu_partition = fused_partition + annotated_partitions.append(mul_partition.nodes + relu_partition.nodes) + if len(relu_partition.output_nodes) > 1: + raise ValueError("Relu partition has more than one output node") + relu_node = relu_partition.output_nodes[0] + if len(mul_partition.output_nodes) > 1: + raise ValueError("mul partition has more than one output node") + mul_node = mul_partition.output_nodes[0] + + if _is_annotated([relu_node, mul_node]): + continue + + input_act_qspec = get_input_act_qspec(quantization_config) + output_act_qspec = get_output_act_qspec(quantization_config) + + input_qspec_map = {} + input_act0 = mul_node.args[0] + if isinstance(input_act0, Node): + if _is_input_large_scalar(input_act0, gm): + continue + if _is_input_non_float_tensor(input_act0): + continue + input_qspec_map[input_act0] = input_act_qspec + + input_act1 = mul_node.args[1] + if isinstance(input_act1, Node): + if _is_input_large_scalar(input_act1, gm): + continue + if _is_input_non_float_tensor(input_act1): + continue + input_qspec_map[input_act1] = input_act_qspec + + mul_node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + _annotated=True, + ) + relu_node.meta["quantization_annotation"] = QuantizationAnnotation( + output_qspec=output_act_qspec, + _annotated=True, + ) + return annotated_partitions + + +@register_annotator("mul") +def _annotate_mul( + gm: torch.fx.GraphModule, + quantization_config: Optional[QuantizationConfig], + filter_fn: Optional[Callable[[Node], bool]] = None, +) -> Optional[List[List[Node]]]: + mul_partitions = get_source_partitions( + gm.graph, ["mul", "mul_", operator.mul, torch.mul, operator.imul], filter_fn + ) + mul_partitions = list(itertools.chain.from_iterable(mul_partitions.values())) + annotated_partitions = [] + for mul_partition in mul_partitions: + annotated_partitions.append(mul_partition.nodes) + mul_node = mul_partition.output_nodes[0] + if _is_annotated([mul_node]): + continue + + input_act_qspec = get_input_act_qspec(quantization_config) + output_act_qspec = get_output_act_qspec(quantization_config) + + input_qspec_map = {} + input_act0 = mul_node.args[0] + if isinstance(input_act0, Node): + if _is_input_large_scalar(input_act0, gm): + continue + if _is_input_non_float_tensor(input_act0): + continue + input_qspec_map[input_act0] = input_act_qspec + + input_act1 = mul_node.args[1] + if isinstance(input_act1, Node): + if _is_input_large_scalar(input_act1, gm): + continue + if _is_input_non_float_tensor(input_act1): + continue + input_qspec_map[input_act1] = input_act_qspec + + mul_node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=output_act_qspec, + 
_annotated=True, + ) + return annotated_partitions + + +# TODO: remove Optional in return type, fix annotated_partitions logic +@register_annotator("cat") +def _annotate_cat( + gm: torch.fx.GraphModule, + quantization_config: Optional[QuantizationConfig], + filter_fn: Optional[Callable[[Node], bool]] = None, +) -> Optional[List[List[Node]]]: + cat_partitions = get_source_partitions(gm.graph, [torch.cat], filter_fn) + cat_partitions = list(itertools.chain.from_iterable(cat_partitions.values())) + annotated_partitions = [] + for cat_partition in cat_partitions: + cat_node = cat_partition.output_nodes[0] + if _is_annotated([cat_node]): + continue + + if cat_node.target != torch.ops.aten.cat.default: + # TODO: change this to AnnotationException + raise Exception( + f"Expected cat node: torch.ops.aten.cat.default, but found {cat_node.target}" + " please check if you are calling the correct capture API" + ) + + annotated_partitions.append(cat_partition.nodes) + + input_act_qspec = get_input_act_qspec(quantization_config) + inputs = cat_node.args[0] + + input_qspec_map = {} + input_act0 = inputs[0] + if isinstance(input_act0, Node): + input_qspec_map[input_act0] = input_act_qspec + + shared_with_input0_qspec = SharedQuantizationSpec((input_act0, cat_node)) + for input_act in inputs[1:]: + input_qspec_map[input_act] = shared_with_input0_qspec + + output_act_qspec = shared_with_input0_qspec + + cat_node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=output_act_qspec, + _annotated=True, + ) + return annotated_partitions + + +def _is_share_obs_or_fq_op(op: Callable) -> bool: + return op in [ + torch.ops.aten.hardtanh.default, + torch.ops.aten.hardtanh_.default, + torch.ops.aten.mean.default, + torch.ops.aten.mean.dim, + torch.ops.aten.permute.default, + torch.ops.aten.permute_copy.default, + torch.ops.aten.squeeze.dim, + torch.ops.aten.squeeze_copy.dim, + # TODO: remove? 
+ torch.ops.aten.adaptive_avg_pool2d.default, + torch.ops.aten.view_copy.default, + torch.ops.aten.view.default, + torch.ops.aten.slice_copy.Tensor, + torch.ops.aten.flatten.using_ints, + ] + + +def propagate_annotation(model: torch.fx.GraphModule) -> None: + for n in model.graph.nodes: + if n.op != "call_function" or not _is_share_obs_or_fq_op(n.target): + continue + + prev_node = n.args[0] + if not isinstance(prev_node, Node): + continue + + quantization_annotation = prev_node.meta.get("quantization_annotation", None) + if not quantization_annotation: + continue + + output_qspec = quantization_annotation.output_qspec + if not output_qspec: + continue + + # make sure current node is not annotated + if ( + "quantization_annotation" in n.meta + and n.meta["quantization_annotation"]._annotated + ): + continue + + shared_qspec = SharedQuantizationSpec(prev_node) + # propagate the previous output_qspec to the current node + n.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map={ + prev_node: shared_qspec, + }, + output_qspec=shared_qspec, + _annotated=True, + ) + + +# TODO: make the list of ops customizable +def _convert_scalars_to_attrs(model: torch.fx.GraphModule) -> torch.fx.GraphModule: + for n in model.graph.nodes: + if n.op != "call_function" or n.target not in [ + torch.ops.aten.add.Tensor, + torch.ops.aten.mul.Tensor, + ]: + continue + args = list(n.args) + new_args = [] + for i in range(len(args)): + if isinstance(args[i], torch.fx.Node): + new_args.append(args[i]) + continue + prefix = "_tensor_constant_" + get_new_attr_name = get_new_attr_name_with_prefix(prefix) + tensor_constant_name = get_new_attr_name(model) + float_tensor = torch.tensor(float(args[i])) + model.register_buffer(tensor_constant_name, float_tensor) + fake_mode = n.meta["val"].fake_mode + with model.graph.inserting_before(n): + get_attr_node = model.graph.create_node( + "get_attr", tensor_constant_name, (), {} + ) + get_attr_node.meta["val"] = fake_mode.from_tensor( + float_tensor, static_shapes=True + ) + new_args.append(get_attr_node) + n.args = tuple(new_args) + model.recompile() + return model diff --git a/backends/arm/operators/op_addmm.py b/backends/arm/operators/op_addmm.py index 959b2034f2..cc49e5c382 100644 --- a/backends/arm/operators/op_addmm.py +++ b/backends/arm/operators/op_addmm.py @@ -65,7 +65,15 @@ def define_node( stride_attr = [1, 1] dilation_attr = [1, 1] - input_zp = -128 if is_quant_node else 0 + input_zp = 0 + if is_quant_node: + input_node = node.all_input_nodes[1] + # rank > 2 linear layer + if input_node.target == exir_ops.edge.aten.view_copy.default: + quant_node = input_node.all_input_nodes[0] + else: + quant_node = input_node + input_zp = get_quant_node_args(quant_node)[1] attr.ConvAttribute( pad=pad_attr, stride=stride_attr, diff --git a/backends/arm/operators/op_placeholder.py b/backends/arm/operators/op_placeholder.py index f935922bc2..597839021f 100644 --- a/backends/arm/operators/op_placeholder.py +++ b/backends/arm/operators/op_placeholder.py @@ -38,9 +38,14 @@ def process_placeholder( if consumer_node.target in dq_q_ops: _, weight_node_scale, weight_node_zp, _, _, _ = getNodeArgs(consumer_node) + int8_max = np.iinfo(np.int8).max + int8_min = np.iinfo(np.int8).min parameter_values_quantized = ( - (parameter_values / weight_node_scale.number) + weight_node_zp.number - ).astype(np.int8) + ((parameter_values / weight_node_scale.number) + weight_node_zp.number) + .round() + .clip(int8_min, int8_max) + .astype(np.int8) + ) tosa_graph.addConst( inputs[0].shape, 
ts.DType.INT8, @@ -63,8 +68,10 @@ def process_placeholder( weight_node_scale, weight_node_zp = get_quant_node_args(weight_node) bias_values_quantized = ( - parameter_values / (input_node_scale * weight_node_scale) - ).astype(np.int32) + (parameter_values / (input_node_scale * weight_node_scale)) + .round() + .astype(np.int32) + ) tosa_graph.addConst( inputs[0].shape, @@ -86,8 +93,8 @@ def process_placeholder( weight_node_scale, _ = get_quant_node_args(weight_node) bias_scales = input_node_scale * weight_node_scale - parameter_values_quantized = (parameter_values / bias_scales).astype( - np.int32 + parameter_values_quantized = ( + (parameter_values / bias_scales).round().astype(np.int32) ) tosa_graph.addConst( diff --git a/backends/arm/test/arm_tosa_reference.py b/backends/arm/test/arm_tosa_reference.py index 4a6bbf44f9..0abec37cc5 100644 --- a/backends/arm/test/arm_tosa_reference.py +++ b/backends/arm/test/arm_tosa_reference.py @@ -41,8 +41,6 @@ SUPPORTED_BI_TEST_LIST = [ "simple_add", "simple_add_broadcast", - "simple_linear", - "simple_linear_rank4", "simple_conv2d_3x3_1x3x256x256_stride1", "simple_conv2d_1x1_1x2x128x128_stride1", "simple_conv2d_2x2_1x1x14x14_stride2", @@ -250,9 +248,8 @@ def tosa_run_test(op, profile=TosaProfile.MI): # noqa: C901 # Need to dequant back to FP32 for running comparison with Torch output if profile is TosaProfile.BI: tosa_output = ( - np.round(tosa_output - output_quantization_zp) - * output_quantization_scale - ) + tosa_output - output_quantization_zp + ) * output_quantization_scale ## Read the Torch Output torch_file = open(TORCH_OUT_PATH + "/torch_output.npy", "rb") diff --git a/backends/arm/test/models/test_mobilenet_v2_arm.py b/backends/arm/test/models/test_mobilenet_v2_arm.py index 961507a863..bd33ee405a 100644 --- a/backends/arm/test/models/test_mobilenet_v2_arm.py +++ b/backends/arm/test/models/test_mobilenet_v2_arm.py @@ -40,8 +40,8 @@ def test_mv2_tosa_MI(self): backend=ArmBackendSelector.TOSA, ) .export() - .check(list(self.all_operators)) .to_edge() + .check(list(self.all_operators)) .partition() .to_executorch() .run_method() @@ -59,8 +59,8 @@ def test_mv2_tosa_BI(self): ) .quantize() .export() - .check(list(self.all_operators)) .to_edge() + .check(list(self.all_operators)) .partition() .to_executorch() .run_method() @@ -78,8 +78,8 @@ def test_mv2_u55_BI(self): ) .quantize() .export() - .check(list(self.all_operators)) .to_edge() + .check(list(self.all_operators)) .partition() .to_executorch() ) diff --git a/backends/arm/test/ops/test_add.py b/backends/arm/test/ops/test_add.py index a7fa07dd05..0791bab5eb 100644 --- a/backends/arm/test/ops/test_add.py +++ b/backends/arm/test/ops/test_add.py @@ -88,7 +88,7 @@ def _test_add_tosa_BI_pipeline( .to_executorch() ) if TOSA_REF_MODEL_INSTALLED: - tester.run_method().compare_outputs() + tester.run_method().compare_outputs(qtol=1) else: logger.warning( "TOSA ref model tool not installed, skip numerical correctness tests" @@ -118,8 +118,6 @@ def test_add_tosa_MI(self): test_data = (torch.randn(4, 4, 4),) self._test_add_tosa_MI_pipeline(self.Add(), test_data) - # TODO: Will this type of parametrization be supported? pytest seem - # have issue with it. @parameterized.expand( [ (torch.ones(5),), # test_data diff --git a/backends/arm/test/ops/test_linear.py b/backends/arm/test/ops/test_linear.py new file mode 100644 index 0000000000..8cbd41e012 --- /dev/null +++ b/backends/arm/test/ops/test_linear.py @@ -0,0 +1,232 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import logging +import shutil +import unittest + +from typing import Tuple + +import torch +from executorch.backends.arm.test.test_models import TosaProfile +from executorch.backends.arm.test.tester.arm_tester import ArmBackendSelector, ArmTester +from parameterized import parameterized + +# TODO: fixme! These globs are a temporary workaround. Reasoning: +# Running the jobs in _unittest.yml will not work since that environment don't +# have the vela tool, nor the tosa_reference_model tool. Hence, we need a way to +# run what we can in that env temporarily. Long term, vela and tosa_reference_model +# should be installed in the CI env. +TOSA_REF_MODEL_INSTALLED = shutil.which("tosa_reference_model") +VELA_INSTALLED = shutil.which("vela") + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +seed = 42 +torch.manual_seed(seed) +test_data_suite = [ + # (test_name, test_data, out_features) + ( + "model_linear_rank1_zeros", + torch.zeros(10, 10), + 10, + ), + ( + "model_linear_rank1_ones", + torch.ones(10, 10), + 10, + ), + ( + "model_linear_rank1_negative_ones", + torch.ones(10, 10) * (-1), + 10, + ), + ( + "model_linear_rank1_rand", + torch.rand(10, 10), + 10, + ), + ( + "model_linear_rank1_negative_large_rand", + torch.rand(10, 10) * (-100), + 10, + ), + ( + "model_linear_rank1_large_randn", + torch.randn(10, 10) * 100, + 10, + ), + ( + "model_linear_rank4_zeros", + torch.zeros(5, 10, 25, 20), + 30, + ), + ( + "model_linear_rank4_ones", + torch.ones(5, 10, 25, 20), + 30, + ), + ( + "model_linear_rank4_negative_ones", + torch.ones(5, 10, 25, 20) * (-1), + 30, + ), + ( + "model_linear_rank4_rand", + torch.rand(5, 10, 25, 20), + 30, + ), + ( + "model_linear_rank4_negative_large_rand", + torch.rand(5, 10, 25, 20) * (-100), + 30, + ), + ( + "model_linear_rank4_large_randn", + torch.randn(5, 10, 25, 20) * 100, + 30, + ), +] + + +class TestLinear(unittest.TestCase): + class Linear(torch.nn.Module): + def __init__( + self, + in_features: int, + out_features: int = 3, + bias: bool = True, + ): + super().__init__() + self.fc = torch.nn.Linear( + in_features=in_features, + out_features=out_features, + bias=bias, + ) + + def forward(self, x): + return self.fc(x) + + def _test_linear_tosa_MI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + ): + tester = ( + ArmTester( + module, + inputs=test_data, + profile=TosaProfile.MI, + backend=ArmBackendSelector.TOSA, + ) + .export() + .check_count({"torch.ops.aten.addmm.default": 1}) + .check_not(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + if TOSA_REF_MODEL_INSTALLED: + tester.run_method().compare_outputs() + else: + logger.warning( + "TOSA ref model tool not installed, skip numerical correctness tests" + ) + + def _test_linear_tosa_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + ): + tester = ( + ArmTester( + module, + inputs=test_data, + profile=TosaProfile.BI, + backend=ArmBackendSelector.TOSA, + ) + .quantize() + .export() + .check_count({"torch.ops.aten.addmm.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + if 
TOSA_REF_MODEL_INSTALLED: + tester.run_method().compare_outputs(qtol=True) + else: + logger.warning( + "TOSA ref model tool not installed, skip numerical correctness tests" + ) + + def _test_linear_tosa_u55_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + ): + ( + ArmTester( + module, + inputs=test_data, + profile=TosaProfile.BI, + backend=ArmBackendSelector.ETHOS_U55, + ) + .quantize() + .export() + .check_count({"torch.ops.aten.addmm.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + + @parameterized.expand(test_data_suite) + def test_linear_tosa_MI( + self, + test_name: str, + test_data: torch.Tensor, + out_features: int, + ): + in_features = test_data.shape[-1] + test_data = (test_data,) + self._test_linear_tosa_MI_pipeline( + self.Linear( + in_features=in_features, + out_features=out_features, + ), + test_data, + ) + + @parameterized.expand(test_data_suite) + def test_linear_tosa_BI( + self, + test_name: str, + test_data: torch.Tensor, + out_features: int, + ): + in_features = test_data.shape[-1] + test_data = (test_data,) + self._test_linear_tosa_BI_pipeline( + self.Linear(in_features=in_features, out_features=out_features), test_data + ) + + @parameterized.expand(test_data_suite) + @unittest.skip("This does not work as of now") + def test_linear_tosa_u55_BI( + self, + test_name: str, + test_data: torch.Tensor, + out_features: int, + ): + in_features = test_data.shape[-1] + test_data = (test_data,) + self._test_linear_tosa_u55_BI_pipeline( + self.Linear( + in_features=in_features, + out_features=out_features, + ), + test_data, + ) diff --git a/backends/arm/test/test_models.py b/backends/arm/test/test_models.py index fbe8806d9b..e7842358da 100644 --- a/backends/arm/test/test_models.py +++ b/backends/arm/test/test_models.py @@ -125,38 +125,6 @@ def __init__(self): def forward(self, x, y): return x + y - @register_test - class simple_linear(torch.nn.Module): - inputs = { - TosaProfile.BI: (torch.rand(1, 2),), - TosaProfile.MI: (torch.rand(1, 2),), - } - - def __init__(self): - super().__init__() - torch.manual_seed(seed) - self.fc = torch.nn.Linear(2, 3) - - def forward(self, x): - x = self.fc(x) - return x - - @register_test - class simple_linear_rank4(torch.nn.Module): - inputs = { - TosaProfile.BI: (torch.rand(5, 10, 25, 20),), - TosaProfile.MI: (torch.rand(5, 10, 25, 20),), - } - - def __init__(self): - super().__init__() - torch.manual_seed(42) - self.fc = torch.nn.Linear(20, 30) - - def forward(self, x): - x = self.fc(x) - return x - """Currenly we compare the quantized result directly with the floating point result, to avoid a noticable precision difference due to wide random numerical distribution, generate small random value range for convolution testing instead for now""" diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py index 35fd85d66b..6dea3b04b6 100644 --- a/backends/arm/test/tester/arm_tester.py +++ b/backends/arm/test/tester/arm_tester.py @@ -4,7 +4,7 @@ # LICENSE file in the root directory of this source tree. 
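Editorial note on the new backends/arm/test/ops/test_linear.py above: each (test_name, test_data, out_features) entry in test_data_suite is expanded into its own test case via parameterized.expand, with in_features inferred from the last dimension of the input tensor. A minimal, self-contained sketch of that pattern (using a hypothetical two-entry suite, not the real one) is:

import unittest

import torch
from parameterized import parameterized

# Hypothetical two-entry suite mirroring the (test_name, test_data, out_features) layout.
_toy_suite = [
    ("rank2_ones", torch.ones(4, 8), 3),
    ("rank4_rand", torch.rand(2, 3, 5, 8), 3),
]


class ToyLinearShapes(unittest.TestCase):
    @parameterized.expand(_toy_suite)
    def test_linear(self, test_name: str, test_data: torch.Tensor, out_features: int):
        # in_features is taken from the last dimension, just as the ArmTester tests do.
        fc = torch.nn.Linear(in_features=test_data.shape[-1], out_features=out_features)
        self.assertEqual(fc(test_data).shape[-1], out_features)


if __name__ == "__main__":
    unittest.main()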
from enum import Enum -from typing import Optional, Tuple +from typing import List, Optional, Tuple, Union import torch from executorch.backends.arm.arm_backend import ( @@ -15,6 +15,7 @@ from executorch.backends.arm.arm_partitioner import ArmPartitioner from executorch.backends.arm.test.tosautil.tosa_test_utils import ( + QuantizationParams, TosaProfile, TosaTestUtils, ) @@ -32,6 +33,7 @@ get_symmetric_quantization_config, XNNPACKQuantizer, ) +from torch.export import ExportedProgram class ArmBackendSelector(Enum): @@ -61,6 +63,7 @@ def __init__( TosaProfile.BI or TosaProfile.MI """ self.tosa_test_util = None + self.is_quantized = profile == TosaProfile.BI if backend == ArmBackendSelector.TOSA: self.tosa_test_util = TosaTestUtils(profile=profile) # The spec below tiggers arm_backend.py to output two files: @@ -119,54 +122,121 @@ def run_method( ), "self.tosa_test_util is not initialized, cannot use run_method()" inputs_to_run = inputs or self.inputs - # TODO: we can't possible need to use all these stages?? - export_stage = self.stages[ - self.stage_name(Export) - ] # this is what XNNpack use to get quant params - toedge_stage = self.stages[ - self.stage_name(ToEdge) - ] # this is what get_input_quantization_params use to get quant params - partition_stage = self.stages[ - self.stage_name(Partition) - ] # this is what tosa_ref_dump_inputs use.... - - # TODO: I'd prefer to use this TOSA buffer instead of output.tosa, - # generated by arm_backend.py. The issue is that we're still depending - # on desc.json, which is created from TosaSerializer class, not from - # the serialized TOSA buffer. Leave this here for review purposes. - # ts_serialized = self._get_serialized_tosa_buffer( # unused - # partition_stage.artifact - # ) - - # This is where the torch reference output is calculated and set - # TODO: This sets self.quantization_scale, which is duplicates - # self.tosa_test_util.quantization.output.scales (?). Fixme. - ( - self.reference_output, - self.quantization_scale, - ) = self._calculate_reference_output(export_stage.artifact, inputs_to_run) - - # Convert the torch inputs to something TOSA ref model can use - tensor_names_and_inputs_np = self.tosa_test_util.convert_inputs_to_tosa( - partition_stage.artifact, toedge_stage.artifact, inputs_to_run + export_stage = self.stages[self.stage_name(Export)] + + (input_names, qp_input) = self._get_input_params(export_stage.artifact) + (output_name, qp_output) = self._get_output_param(export_stage.artifact) + + # Calculate the reference output using the original module or the quant + # module. self.quantization_scale is used by compare_outputs() to + # calculate the tolerance + self.quantization_scale = None if qp_output is None else qp_output.scale + if self.is_quantized: + module_for_ref = self.stages[self.stage_name(Quantize)].artifact + else: + module_for_ref = self.original_module + self.reference_output = self._calculate_reference_output( + module_for_ref, inputs_to_run ) # Run the TOSA ref model to get the output tensor, which will be # compared to the torch output in compare_outputs() self.stage_output = self.tosa_test_util.run_tosa_ref_model( - tensor_names_and_inputs_np + params_input=(input_names, qp_input), + param_output=(output_name, qp_output), + inputs=inputs_to_run, ) return self - def _get_serialized_tosa_buffer(self, partition_stage: Partition) -> bytes: + def _get_input_params( + self, program: ExportedProgram + ) -> Tuple[str, Union[List[QuantizationParams], List[None]]]: """ - This is just a prototype... 
- Todo: - * The "_0" indicates that there are many lowered modules. Loop it! - * There's probably a better way to get this buffer. An API? Yes, - it seems the serialize stage does this for you... + Get name and optionally quantization parameters for the inputs to this + model. + + Args: + program (ExportedProgram): The program to get input parameters from + Returns: + Tuple[str, Optional[QuantizationParams]]: A tuple containing the + input node names and their quantization parameters. + """ + input_names = [] + # E.g. bias and weights are 'placeholders' as well. This is used to + # get only the use inputs. + usr_inputs = program.graph_signature.user_inputs + for node in program.graph.nodes: + if node.op == "placeholder" and node.name in usr_inputs: + input_names.append(node.name) + continue + + if self.is_quantized: + quant_params = [] + for node in program.graph.nodes: + if ( + node.target + == torch.ops.quantized_decomposed.quantize_per_tensor.default + and node.args[0].name in input_names + ): + qp = QuantizationParams( + node_name=node.args[0].name, scale=node.args[1], zp=node.args[2] + ) + quant_params.append(qp) + if len(quant_params) == len( + input_names + ): # break early if we have all the inputs quantized parameters + break + assert len(quant_params) != 0, "Quantization paramerters not found" + return (input_names, quant_params) + else: + return (input_names, len(input_names) * [None]) # return a list of None's + + def _get_output_param( + self, program: ExportedProgram + ) -> Tuple[str, Union[QuantizationParams, None]]: """ - return partition_stage._edge_programs[ - "forward" - ]._graph_module.lowered_module_0.processed_bytes + Get name and optionally quantization parameters for the inputs to this + model. + + Args: + program (ExportedProgram): The program to get output parameters from. + Returns: + Tuple[str, Optional[QuantizationParams]]: A tuple containing the + output node name and its quantization parameters. + """ + output_node = None + for node in program.graph.nodes: + if node.op == "output": + output_node = node + break + + if self.is_quantized: + quant_params = None + for node in program.graph.nodes: + if ( + node.target + == torch.ops.quantized_decomposed.dequantize_per_tensor.default + and node == output_node.args[0][0] + ): + quant_params = QuantizationParams( + node_name=node.args[0].name, scale=node.args[1], zp=node.args[2] + ) + break # break early, there's only one output node + assert quant_params is not None, "Quantization paramerters not found" + return (output_node.name, quant_params) + else: + return (output_node.name, None) + + @staticmethod + def _calculate_reference_output( + module: Union[torch.fx.GraphModule, torch.nn.Module], inputs + ) -> torch.Tensor: + """ + Note: I'd prefer to use the base class method here, but since it use the + exported program, I can't. The partitioner stage clears the state_dict + of the exported program, which causes an issue when evaluating the + module. 
+ """ + + return module.forward(*inputs) diff --git a/backends/arm/test/tosautil/tosa_test_utils.py b/backends/arm/test/tosautil/tosa_test_utils.py index 48bbb6311f..ff39761559 100644 --- a/backends/arm/test/tosautil/tosa_test_utils.py +++ b/backends/arm/test/tosautil/tosa_test_utils.py @@ -10,34 +10,27 @@ import subprocess import tempfile -from collections import namedtuple -from typing import Dict, List, Optional, Tuple, Union +from typing import List, Optional, Tuple import numpy as np import torch -from executorch.backends.arm.test.arm_tosa_reference import ( - get_input_quantization_params, # TODO: remove this dependecy - get_output_quantization_param, # TODO: remove this dependecy - tosa_ref_dump_inputs, # TODO: remove this dependecy -) from executorch.backends.arm.test.test_models import TosaProfile -from executorch.backends.xnnpack.test.tester.tester import Partition, ToEdge logger = logging.getLogger(__name__) logger.setLevel(logging.WARNING) class QuantizationParams: - __slots__ = ["zps", "scales"] + __slots__ = ["node_name", "zp", "scale"] - def __init__(self, zps: Union[Dict, List[int]], scales: Union[Dict, List[float]]): - self.zps = zps - self.scales = scales + # todo: zps and scales can be per tensors or per channel => a list?? + def __init__(self, node_name: str, zp: int, scale: float): + self.node_name = node_name # not need I think, but good for error check + self.zp = zp + self.scale = scale -Quantization = namedtuple("Quantization", ["input", "output"]) - """ This class is used to work with TOSA artifacts. """ @@ -55,9 +48,6 @@ def __init__( ) self.tosa_ref_model_path = tosa_ref_model_path or "tosa_reference_model" self.profile = profile or TosaProfile.MI - input_quant = QuantizationParams(zps={}, scales={}) - output_quant = QuantizationParams(zps={}, scales={}) - self.quantization = Quantization(input=input_quant, output=output_quant) assert os.path.exists( self.intermediate_path ), f"TOSA artifact path don't exist! Path: {self.intermediate_path}" @@ -106,73 +96,11 @@ def dbg_dump_readble_tosa_file(self) -> None: self._run_cmd(cmd_flatc) return - def convert_inputs_to_tosa( - self, - partition_stage: Partition, - toedge_stage: ToEdge, - inputs_to_run: Tuple[torch.Tensor], - ) -> List[Tuple[np.ndarray, str]]: - """ - Convert input tensors to numpy and save them to disk as .npy files. The - TOSA reference model will use these files as input.... - - Args: - partition_stage (Partition): The partition stage. - toedge_stage (ToEdge): The toedge stage. - inputs_to_run (Tuple[torch.Tensor]): The input tensors to convert. - - Returns: - List[Tuple[np.ndarray, str]]: A list of tuples, where each tuple contain - a numpy array and the name of the tensor. - - Todo: - * I'd like to use some other common function instead of - get_input_quantization_params() and - get_output_quantization_param(). - * I'd like to get rid of the call to tosa_ref_dump_inputs as well. - All this function is doing is to convert to numpy and save to disk - """ - - if self.profile == TosaProfile.BI: - # TODO: Unclear to me why we need to pass toedge_stage here. Ideally - # we shouldn't get the quantization params here at all, but rather - # from the Quantizer... - ( - self.quantization.input.scales, - self.quantization.input.zps, - ) = get_input_quantization_params(toedge_stage) - ( - self.quantization.output.scales, - self.quantization.output.zps, - ) = get_output_quantization_param(toedge_stage) - - # TODO: I think it should be possible to get this input data from - # somewhere else. 
Why do I need to call this just to get a npy file, - # which is just a quantized version of the input..? - np_data_and_tensor_names = tosa_ref_dump_inputs( - partition_stage, - inputs_to_run, - self.intermediate_path, - self.quantization.input.scales, - self.quantization.input.zps, - self.profile, - save_on_disk=False, # If True - this one produces arg0_1.npy, which is just a quant version of the input - # inputs_to_run -> convert to numpy -> do "manual" quantization -> save to arg0_1.npy (TODO: remove this comment) - ) - else: - np_data_and_tensor_names = tosa_ref_dump_inputs( - partition_stage, - inputs_to_run, - self.intermediate_path, - {}, - {}, - save_on_disk=False, - ) - - return np_data_and_tensor_names - def run_tosa_ref_model( - self, tensor_names_and_inputs: List[Tuple[np.array, str]] + self, + params_input: Tuple[List[str], List[QuantizationParams]], + param_output: Tuple[str, QuantizationParams], + inputs: Tuple[torch.Tensor], ) -> torch.Tensor: """ Run TOSA reference model using the tosa_refence_model program. @@ -184,18 +112,20 @@ def run_tosa_ref_model( These two files are created by arm_backend.py as part of partition stage - 3. An IFM file containing input data, saved as .npy. This file is - created by tosa_ref_dump_inputs() - All these files are saved on disk in self.intermediate_path. Args: - tensor_names_and_inputs (List[Tuple[np.array, str]]): A list of tuples - where each tuple contains inputs (as numpy array) and the name of - the tensor. + params_input (Tuple[List[str], List[QuantizationParams]]): A tuple + containing a list of input node names and a list of their + quantization parameters (if model is quantized). + param_output (Tuple[str, QuantizationParams]): A tuple containing + the output node name and its quantization parameters (if + model is quantized). + inputs (Tuple[torch.Tensor]): The input data to run the TOSA Returns: - torch.Tensor: The output of the TOSA reference model, as a torch tensor. + torch.Tensor: The output of the TOSA reference model, as a torch + tensor. Here's a sample desc.json file: { @@ -228,11 +158,26 @@ def run_tosa_ref_model( ), f"desc_file_path: {desc_file_path} does not exist" # Save the input data to disk as a .npy file, since that's what the TOSA - # reference model expects. Name of the file is must match the name in + # reference model expects. Name of the file must match the name in # desc.json, which is the tensor name from the graph + .npy - for tensor_name, data in tensor_names_and_inputs: - file_path = os.path.join(self.intermediate_path, tensor_name + ".npy") - np.save(file_path, data, allow_pickle=False) + for input_name, quant_param, data in zip( + params_input[0], params_input[1], inputs + ): + data_np = data.detach().numpy() + if self.profile is TosaProfile.BI: + assert ( + quant_param.node_name == input_name + ), "These quantization params do not match the input tensor name" + int8_max = np.iinfo(np.int8).max + int8_min = np.iinfo(np.int8).min + data_np = ( + ((data_np / np.float32(quant_param.scale)) + quant_param.zp) + .round() + .clip(int8_min, int8_max) + .astype(np.int8) + ) + file_path = os.path.join(self.intermediate_path, input_name + ".npy") + np.save(file_path, data_np, allow_pickle=False) # Run the TOSA reference model via command line, this will produce a # .npy file with the result (aka OFM). 
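Editorial note: the affine int8 conversion applied to the inputs here (and mirrored when dequantizing the reference-model output in the next hunk, as well as in op_placeholder.py earlier) boils down to q = clip(round(x / scale + zp)) and x ≈ (q - zp) * scale. A small standalone NumPy sketch of that round trip, with made-up scale and zero-point values, is:

import numpy as np


def quantize_int8(x: np.ndarray, scale: float, zp: int) -> np.ndarray:
    # q = clip((x / scale + zp).round()) into the int8 range, as done for the IFM .npy files.
    info = np.iinfo(np.int8)
    return ((x / np.float32(scale)) + zp).round().clip(info.min, info.max).astype(np.int8)


def dequantize_int8(q: np.ndarray, scale: float, zp: int) -> np.ndarray:
    # x ~= (q - zp) * scale, as done when comparing the TOSA OFM with the torch output.
    return (q.astype(np.float32) - zp) * np.float32(scale)


x = np.array([-1.5, 0.0, 0.7, 3.2], dtype=np.float32)
scale, zp = 0.05, -10  # illustrative values only
print(dequantize_int8(quantize_int8(x, scale, zp), scale, zp))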
@@ -252,12 +197,11 @@ def run_tosa_ref_model( if self.profile is TosaProfile.BI: # Need to dequant back to FP32 for comparison with torch output - assert self.quantization.output.scales is not None - assert self.quantization.output.zps is not None - tosa_ref_output = ( - np.round(tosa_ref_output - self.quantization.output.zps) - * self.quantization.output.scales - ) + quant_param = param_output[1] + assert ( + quant_param is not None + ), "There are no qunatization parameters, check output parameters" + tosa_ref_output = (tosa_ref_output - quant_param.zp) * quant_param.scale # tosa_output is a numpy array, convert to torch tensor for comparison tosa_ref_output = torch.from_numpy(tosa_ref_output.astype("float32")) diff --git a/backends/vulkan/CMakeLists.txt b/backends/vulkan/CMakeLists.txt index 834f2704fc..24855545fe 100644 --- a/backends/vulkan/CMakeLists.txt +++ b/backends/vulkan/CMakeLists.txt @@ -8,7 +8,7 @@ # # This file should be formatted with # ~~~ -# cmake-format --first-comment-is-literal=True CMakeLists.txt +# cmake-format --first-comment-is-literal=True -i CMakeLists.txt # ~~~ # It should also be cmake-lint clean. # @@ -32,7 +32,10 @@ if(NOT FLATC_EXECUTABLE) set(FLATC_EXECUTABLE flatc) endif() -# Include this file to access target_link_options_shared_lib +# Include this file to access target_link_options_shared_lib This is required to +# provide access to target_link_options_shared_lib which allows libraries to be +# linked with the --whole-archive flag. This is required for libraries that +# perform dynamic registration via static initialization. include(${EXECUTORCH_ROOT}/build/Utils.cmake) # ATen Vulkan Libs @@ -48,12 +51,19 @@ set(COMMON_INCLUDES ${VULKAN_API_HEADERS} ${EXECUTORCH_ROOT}/..) file(GLOB_RECURSE vulkan_graph_cpp ${RUNTIME_PATH}/graph/*) add_library(vulkan_graph_lib STATIC ${vulkan_graph_cpp}) - target_include_directories(vulkan_graph_lib PRIVATE ${COMMON_INCLUDES}) +target_link_libraries(${LIBRARY_NAME} vulkan_api_lib) +target_compile_options(vulkan_graph_lib PRIVATE ${VULKAN_CXX_FLAGS}) +# Link this library with --whole-archive due to dynamic operator registrations +target_link_options_shared_lib(vulkan_graph_lib) -target_link_libraries(vulkan_graph_lib vulkan_shader_lib) +# Due to dynamic registrations, these libraries must be explicitly linked +set(VULKAN_STANDARD_OPS_LIBS vulkan_graph_lib vulkan_graph_shaderlib) -target_compile_options(vulkan_graph_lib PRIVATE ${VULKAN_CXX_FLAGS}) +# vulkan_graph_shaderlib + +set(VULKAN_GRAPH_SHADERS_PATH ${RUNTIME_PATH}/graph/ops/glsl/) +vulkan_shader_library(${VULKAN_GRAPH_SHADERS_PATH} vulkan_graph_shaderlib) # Generate Files from flatc @@ -85,19 +95,12 @@ target_include_directories( file(GLOB vulkan_backend_cpp ${RUNTIME_PATH}/*.cpp) add_library(vulkan_backend ${vulkan_backend_cpp}) - -target_include_directories(vulkan_backend PRIVATE ${SCHEMA_INCLUDE_DIR}) -target_include_directories(vulkan_backend PRIVATE ${COMMON_INCLUDES}) - -target_link_libraries(vulkan_backend PRIVATE vulkan_graph_lib) -target_link_libraries(vulkan_backend PRIVATE vulkan_schema) -target_link_libraries(vulkan_backend PRIVATE executorch) - +target_include_directories(vulkan_backend PRIVATE ${SCHEMA_INCLUDE_DIR} + ${COMMON_INCLUDES}) +target_link_libraries(vulkan_backend PRIVATE vulkan_graph_lib vulkan_schema + executorch) target_compile_options(vulkan_backend PRIVATE ${VULKAN_CXX_FLAGS}) - -# This is required to ensure that vulkan_backend gets linked with -# --whole-archive since backends are registered via static variables that would -# otherwise 
be discarded +# Link this library with --whole-archive due to dynamic backend registration target_link_options_shared_lib(vulkan_backend) # Executor Runner @@ -105,28 +108,38 @@ target_link_options_shared_lib(vulkan_backend) if(NOT CMAKE_TOOLCHAIN_FILE MATCHES ".*iOS\.cmake$") set(VULKAN_RUNNER_SRCS ${_executor_runner__srcs}) list(TRANSFORM VULKAN_RUNNER_SRCS PREPEND "${EXECUTORCH_ROOT}/") + add_executable(vulkan_executor_runner ${VULKAN_RUNNER_SRCS}) - target_link_libraries(vulkan_executor_runner ${_executor_runner_libs}) - target_link_libraries(vulkan_executor_runner vulkan_schema) - target_link_libraries(vulkan_executor_runner vulkan_backend) + target_link_libraries( + vulkan_executor_runner ${_executor_runner_libs} vulkan_schema + vulkan_backend ${VULKAN_STANDARD_OPS_LIBS}) target_compile_options(vulkan_executor_runner PUBLIC ${VULKAN_CXX_FLAGS}) add_library(vulkan_executor_runner_lib STATIC ${VULKAN_RUNNER_SRCS}) - target_link_libraries(vulkan_executor_runner_lib ${_executor_runner_libs}) - target_link_libraries(vulkan_executor_runner_lib vulkan_schema) - target_link_libraries(vulkan_executor_runner_lib vulkan_backend) + target_link_libraries(vulkan_executor_runner_lib ${_executor_runner_libs} + vulkan_schema vulkan_backend) target_compile_options(vulkan_executor_runner_lib PUBLIC ${VULKAN_CXX_FLAGS}) endif() # Test targets if(EXECUTORCH_BUILD_GTESTS) + set(TEST_UTILS_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/test/utils) + file(GLOB TEST_UTILS_CPP ${CMAKE_CURRENT_SOURCE_DIR}/test/utils/*.cpp) + + set(TEST_SHADERS_PATH ${CMAKE_CURRENT_SOURCE_DIR}/test/glsl) + vulkan_shader_library(${TEST_SHADERS_PATH} test_shaderlib) + # vulkan_compute_api_test - set(TEST_CPP ${CMAKE_CURRENT_SOURCE_DIR}/test/vulkan_compute_api_test.cpp) - add_executable(vulkan_compute_api_test ${TEST_CPP}) - target_include_directories(vulkan_compute_api_test PRIVATE ${COMMON_INCLUDES}) - target_link_libraries(vulkan_compute_api_test vulkan_api_lib) - target_link_libraries(vulkan_compute_api_test vulkan_graph_lib) - target_link_libraries(vulkan_compute_api_test gtest_main) + set(COMPUTE_API_TEST_CPP + ${CMAKE_CURRENT_SOURCE_DIR}/test/vulkan_compute_api_test.cpp) + + add_executable(vulkan_compute_api_test ${COMPUTE_API_TEST_CPP} + ${TEST_UTILS_CPP}) + target_include_directories(vulkan_compute_api_test + PRIVATE ${COMMON_INCLUDES} ${TEST_UTILS_HEADERS}) + target_link_libraries( + vulkan_compute_api_test PRIVATE gtest_main ${VULKAN_STANDARD_OPS_LIBS} + test_shaderlib) target_compile_options(vulkan_compute_api_test PRIVATE ${VULKAN_CXX_FLAGS}) endif() diff --git a/backends/vulkan/TARGETS b/backends/vulkan/TARGETS index 86733510a3..5bd6cf12f6 100644 --- a/backends/vulkan/TARGETS +++ b/backends/vulkan/TARGETS @@ -3,7 +3,7 @@ load(":targets.bzl", "define_common_targets") oncall("executorch") -define_common_targets() +define_common_targets(is_fbcode = True) runtime.python_library( name = "vulkan_preprocess", diff --git a/backends/vulkan/cmake/ATenVulkan.cmake b/backends/vulkan/cmake/ATenVulkan.cmake index 6db7868150..41a28823f6 100644 --- a/backends/vulkan/cmake/ATenVulkan.cmake +++ b/backends/vulkan/cmake/ATenVulkan.cmake @@ -8,7 +8,7 @@ # # This file should be formatted with # ~~~ -# cmake-format --first-comment-is-literal=True CMakeLists.txt +# cmake-format --first-comment-is-literal=True -i ATenVulkan.cmake # ~~~ # It should also be cmake-lint clean. 
# @@ -22,20 +22,6 @@ if(NOT VULKAN_THIRD_PARTY_PATH) set(VULKAN_THIRD_PARTY_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../third-party) endif() -# Shader Codegen - -# Trigger Shader code generation -set(USE_VULKAN ON) -set(VULKAN_CODEGEN_CMAKE_PATH ${PYTORCH_PATH}/cmake/VulkanCodegen.cmake) -if(NOT EXISTS ${VULKAN_CODEGEN_CMAKE_PATH}) - message( - FATAL_ERROR - "Cannot perform SPIR-V codegen because " ${VULKAN_CODEGEN_CMAKE_PATH} - " does not exist. Please make sure that submodules are initialized" - " and updated.") -endif() -include(${PYTORCH_PATH}/cmake/VulkanCodegen.cmake) - # Source paths and compile settings set(ATEN_PATH ${PYTORCH_PATH}/aten/src) @@ -65,19 +51,56 @@ list(APPEND VULKAN_API_HEADERS ${VOLK_PATH}) list(APPEND VULKAN_API_HEADERS ${VMA_PATH}) target_include_directories(vulkan_api_lib PRIVATE ${VULKAN_API_HEADERS}) - target_compile_options(vulkan_api_lib PRIVATE ${VULKAN_CXX_FLAGS}) -# vulkan_shader_lib +# Find GLSL compiler executable + +if(ANDROID) + if(NOT ANDROID_NDK) + message(FATAL_ERROR "ANDROID_NDK not set") + endif() + + set(GLSLC_PATH + "${ANDROID_NDK}/shader-tools/${ANDROID_NDK_HOST_SYSTEM_NAME}/glslc") +else() + find_program( + GLSLC_PATH glslc + PATHS ENV VULKAN_SDK + PATHS "$ENV{VULKAN_SDK}/${CMAKE_HOST_SYSTEM_PROCESSOR}/bin" + PATHS "$ENV{VULKAN_SDK}/bin") + + if(NOT GLSLC_PATH) + message(FATAL_ERROR "USE_VULKAN glslc not found") + endif() +endif() + +# Required to enable linking with --whole-archive +include(${EXECUTORCH_ROOT}/build/Utils.cmake) -file(GLOB VULKAN_IMPL_CPP ${ATEN_VULKAN_PATH}/impl/*.cpp) +# Convenience macro to create a shader library -add_library(vulkan_shader_lib STATIC ${VULKAN_IMPL_CPP} ${vulkan_generated_cpp}) +macro(vulkan_shader_library SHADERS_PATH LIBRARY_NAME) + set(VULKAN_SHADERGEN_ENV "") + set(VULKAN_SHADERGEN_OUT_PATH ${CMAKE_BINARY_DIR}/${LIBRARY_NAME}) -list(APPEND VULKAN_API_HEADERS ${CMAKE_BINARY_DIR}/vulkan) + execute_process( + COMMAND + "${PYTHON_EXECUTABLE}" ${PYTORCH_PATH}/tools/gen_vulkan_spv.py --glsl-path + ${SHADERS_PATH} --output-path ${VULKAN_SHADERGEN_OUT_PATH} + --glslc-path=${GLSLC_PATH} --tmp-dir-path=${VULKAN_SHADERGEN_OUT_PATH} + --env ${VULKAN_GEN_ARG_ENV} + RESULT_VARIABLE error_code) + set(ENV{PYTHONPATH} ${PYTHONPATH}) -target_include_directories(vulkan_shader_lib PRIVATE ${VULKAN_API_HEADERS}) + set(vulkan_generated_cpp ${VULKAN_SHADERGEN_OUT_PATH}/spv.cpp) -target_link_libraries(vulkan_shader_lib vulkan_api_lib) + add_library(${LIBRARY_NAME} STATIC ${vulkan_generated_cpp}) + target_include_directories(${LIBRARY_NAME} PRIVATE ${COMMON_INCLUDES}) + target_link_libraries(${LIBRARY_NAME} vulkan_api_lib) + target_compile_options(${LIBRARY_NAME} PRIVATE ${VULKAN_CXX_FLAGS}) + # Link this library with --whole-archive due to dynamic shader registrations + target_link_options_shared_lib(${LIBRARY_NAME}) -target_compile_options(vulkan_shader_lib PRIVATE ${VULKAN_CXX_FLAGS}) + unset(VULKAN_SHADERGEN_ENV) + unset(VULKAN_SHADERGEN_OUT_PATH) +endmacro() diff --git a/backends/vulkan/partitioner/vulkan_partitioner.py b/backends/vulkan/partitioner/vulkan_partitioner.py index 84c9a132e2..12c8696c21 100644 --- a/backends/vulkan/partitioner/vulkan_partitioner.py +++ b/backends/vulkan/partitioner/vulkan_partitioner.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
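Editorial note on the vulkan_shader_library macro above: it wraps a single execute_process call into PyTorch's tools/gen_vulkan_spv.py and then compiles the generated spv.cpp into a static library linked with --whole-archive. A rough Python rendering of that shader-generation step (paths and the pytorch_path argument are placeholders; the flags shown are the ones the macro passes, with the trailing --env argument omitted) might look like:

import subprocess
import sys
from pathlib import Path


def generate_vulkan_spv(glsl_path: str, out_dir: str, glslc_path: str, pytorch_path: str) -> Path:
    """Sketch of the execute_process step in vulkan_shader_library (placeholder paths)."""
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    subprocess.run(
        [
            sys.executable, f"{pytorch_path}/tools/gen_vulkan_spv.py",
            "--glsl-path", glsl_path,
            "--output-path", str(out),
            f"--glslc-path={glslc_path}",
            f"--tmp-dir-path={out}",
        ],
        check=True,
    )
    # CMake then builds the generated spv.cpp into a STATIC library and applies
    # target_link_options_shared_lib so the shader registrations are not discarded.
    return out / "spv.cpp"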
+import operator from typing import final, List, Optional import torch @@ -30,6 +31,7 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: exir_ops.edge.aten.mul.Tensor, exir_ops.edge.aten.sub.Tensor, exir_ops.edge.aten.pow.Tensor_Tensor, + operator.getitem, ] return supported diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp index 9c554a232c..62555adc73 100644 --- a/backends/vulkan/runtime/VulkanBackend.cpp +++ b/backends/vulkan/runtime/VulkanBackend.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -23,12 +24,15 @@ #include /* strtol */ #include #include +#include namespace torch { namespace executor { namespace vulkan { namespace { +using namespace at::native::vulkan; + // Flatbuffer types using VkGraphPtr = const vkgraph::VkGraph*; using OpCallPtr = const vkgraph::OperatorCall*; @@ -51,102 +55,274 @@ const uint8_t* getConstantDataPtr( return constant_data + constant_bytes->offset(); } -using namespace at::native::vulkan; +api::ScalarType get_scalar_type(const vkgraph::VkDataType& vk_datatype) { + switch (vk_datatype) { + case vkgraph::VkDataType::BOOL: + return api::kBool; + case vkgraph::VkDataType::UINT8: + return api::kByte; + case vkgraph::VkDataType::INT8: + return api::kChar; + case vkgraph::VkDataType::INT32: + return api::kInt; + case vkgraph::VkDataType::FLOAT16: + return api::kHalf; + case vkgraph::VkDataType::FLOAT32: + return api::kFloat; + } +} + +class GraphBuilder { + ComputeGraph* compute_graph_; + VkGraphPtr flatbuffer_; + const uint8_t* constant_data_; + + std::unordered_map ref_mapping_; -class VulkanBackend final : public PyTorchBackendInterface { public: - ~VulkanBackend() override = default; + explicit GraphBuilder( + ComputeGraph* compute_graph, + VkGraphPtr flatbuffer, + const uint8_t* constant_data) + : compute_graph_(compute_graph), + flatbuffer_(flatbuffer), + constant_data_(constant_data), + ref_mapping_() {} + + bool fb_id_exists(const uint32_t fb_id) { + const std::unordered_map::iterator found_ref = + ref_mapping_.find(fb_id); - bool is_available() const override { - return true; + return found_ref != ref_mapping_.end(); } - api::ScalarType get_scalar_type( - const vkgraph::VkDataType& vk_datatype) const { - switch (vk_datatype) { - case (vkgraph::VkDataType::fp32): { - return api::kFloat; - } - } + ValueRef get_fb_id_valueref(const uint32_t fb_id) { + const std::unordered_map::iterator found_ref = + ref_mapping_.find(fb_id); + + ET_CHECK_MSG( + found_ref != ref_mapping_.end(), + "Trying to extract a value that hasn't yet been added to the graph."); + + return found_ref->second; } - ValueRef get_value_ref( - const uint32_t value_id, - VkGraphPtr flatbuffer_graph, - ComputeGraph* compute_graph, - std::unordered_map& ref_mapping, - VkValuesVector value_mapping, - const uint8_t* constant_data) const { - const std::unordered_map::iterator found_ref = - ref_mapping.find(value_id); + void add_tensor_to_graph(const uint32_t fb_id, VkTensorPtr tensor_fb) { + const api::ScalarType& dtype = get_scalar_type(tensor_fb->datatype()); + + UIntVector dims_fb = tensor_fb->dims(); + const std::vector dims_vector(dims_fb->cbegin(), dims_fb->cend()); + + ValueRef ref; + if (tensor_fb->constant_id() >= 0) { + const uint8_t* tensor_data = getConstantDataPtr( + flatbuffer_, tensor_fb->constant_id(), constant_data_); - if (found_ref != ref_mapping.end()) { - return found_ref->second; + ref = compute_graph_->add_tensorref(dims_vector, dtype, tensor_data); + } else { + ref = 
compute_graph_->add_tensor( + dims_vector, dtype, tensor_fb->mem_obj_id()); } - VkValuePtr vk_value = value_mapping->Get(value_id); - VkTensorPtr vk_tensor = vk_value->value(); + ref_mapping_[fb_id] = ref; + } + + template + typename std::enable_if::value, void>::type + add_scalar_to_graph(const uint32_t fb_id, T value) { + ValueRef ref = compute_graph_->add_scalar(value); + ref_mapping_[fb_id] = ref; + } + + template + typename std::enable_if::value, void>::type + add_scalar_list_to_graph(const uint32_t fb_id, std::vector&& value) { + ValueRef ref = compute_graph_->add_scalar_list(std::move(value)); + ref_mapping_[fb_id] = ref; + } + + void add_value_list_to_graph( + const uint32_t fb_id, + std::vector&& value) { + ValueRef ref = compute_graph_->add_value_list(std::move(value)); + ref_mapping_[fb_id] = ref; + } + + void add_string_to_graph(const uint32_t fb_id, VkValuePtr value) { + const auto fb_str = value->value_as_String()->string_val(); + std::string string(fb_str->cbegin(), fb_str->cend()); + ValueRef ref = compute_graph_->add_string(std::move(string)); + ref_mapping_[fb_id] = ref; + } + void add_value_to_graph(const uint32_t fb_id, VkValuePtr value) { ET_CHECK_MSG( - vk_tensor->constant_id() >= 0, - "Only constant buffers are supported when adding tensors to compute graph (indicated by constant_id < 0), but got constant_id of %d", - vk_tensor->constant_id()); + !fb_id_exists(fb_id), + "Trying to add a value that has already been added to the graph."); + + switch (value->value_type()) { + case vkgraph::GraphTypes::Int: + add_scalar_to_graph(fb_id, value->value_as_Int()->int_val()); + break; + case vkgraph::GraphTypes::Double: + add_scalar_to_graph(fb_id, value->value_as_Double()->double_val()); + break; + case vkgraph::GraphTypes::Bool: + add_scalar_to_graph(fb_id, value->value_as_Bool()->bool_val()); + break; + case vkgraph::GraphTypes::VkTensor: + add_tensor_to_graph(fb_id, value->value_as_VkTensor()); + break; + case vkgraph::GraphTypes::IntList: + add_scalar_list_to_graph( + fb_id, + std::vector( + value->value_as_IntList()->items()->cbegin(), + value->value_as_IntList()->items()->cend())); + break; + case vkgraph::GraphTypes::DoubleList: + add_scalar_list_to_graph( + fb_id, + std::vector( + value->value_as_DoubleList()->items()->cbegin(), + value->value_as_DoubleList()->items()->cend())); + break; + case vkgraph::GraphTypes::BoolList: + add_scalar_list_to_graph( + fb_id, + std::vector( + value->value_as_BoolList()->items()->cbegin(), + value->value_as_BoolList()->items()->cend())); + break; + case vkgraph::GraphTypes::ValueList: + add_value_list_to_graph( + fb_id, + std::vector( + value->value_as_ValueList()->items()->cbegin(), + value->value_as_ValueList()->items()->cend())); + break; + case vkgraph::GraphTypes::String: + add_string_to_graph(fb_id, value); + break; + default: + ET_CHECK_MSG(false, "Unsupported value type."); + } + } - const api::ScalarType& tensor_dtype = - get_scalar_type(vk_tensor->datatype()); + void build_graph() { + // First, add all values to the graph + for (uint32_t fb_id = 0; fb_id < flatbuffer_->values()->size(); ++fb_id) { + VkValuePtr value = flatbuffer_->values()->Get(fb_id); + add_value_to_graph(fb_id, value); + } - UIntVector tensor_dims_fb = vk_tensor->dims(); - const std::vector tensor_dims_vector( - tensor_dims_fb->cbegin(), tensor_dims_fb->cend()); + // Parse the inputs + for (const uint32_t fb_id : *flatbuffer_->input_ids()) { + const ValueRef ref = get_fb_id_valueref(fb_id); + compute_graph_->set_input_tensor(ref); + } - const uint8_t* 
tensor_data = getConstantDataPtr( - flatbuffer_graph, vk_tensor->constant_id(), constant_data); + // Parse the operators + for (OpCallPtr op_call : *(flatbuffer_->chain())) { + std::string op_name = op_call->name()->str(); + ET_CHECK_MSG(VK_HAS_OP(op_name), "Missing operator: %s", op_name.c_str()); - const ValueRef value_ref = compute_graph->add_tensorref( - tensor_dims_vector, tensor_dtype, tensor_data); + const std::vector arg_fb_ids( + op_call->args()->cbegin(), op_call->args()->cend()); - ref_mapping[value_id] = value_ref; + std::vector args; + for (const int arg_fb_id : arg_fb_ids) { + args.push_back(get_fb_id_valueref(arg_fb_id)); + } - return value_ref; + auto vkFn = VK_GET_OP_FN(op_name); + vkFn(*compute_graph_, args); + } + + // Parse the outputs + for (const uint32_t fb_id : *flatbuffer_->output_ids()) { + const ValueRef ref = get_fb_id_valueref(fb_id); + compute_graph_->set_output_tensor(ref); + } + } +}; + +// +// Execution tools +// + +bool maybe_resize_input( + ComputeGraph* graph, + const size_t input_i, + exec_aten::Tensor& et_tensor) { + ValueRef in_tensor_ref = graph->inputs()[input_i].value; + vTensor& in_tensor = graph->get_val(in_tensor_ref).toTensor(); + + ET_CHECK_MSG( + et_tensor.dim() == in_tensor.sizes().size(), + "Cannot resize input tensor: old ndim %zu does not match new ndim %zu", + static_cast(in_tensor.sizes().size()), + static_cast(et_tensor.dim())); + + bool should_resize = false; + std::vector new_sizes(et_tensor.dim()); + for (size_t i = 0; i < et_tensor.dim(); i++) { + if (in_tensor.sizes()[i] != et_tensor.sizes()[i]) { + should_resize = true; + } + new_sizes.at(i) = et_tensor.sizes()[i]; + } + + if (should_resize) { + graph->resize_input(input_i, new_sizes); } - GraphConfig generate_config() const { - const uint32_t submit_frequency = UINT32_MAX; - - const api::CommandPoolConfig cmd_config{ - 4u, // cmdPoolInitialSize - 2u, // cmdPoolBatchSize - }; - - const api::DescriptorPoolConfig descriptor_pool_config{ - 1024u, // descriptorPoolMaxSets - 1024u, // descriptorUniformBufferCount - 1024u, // descriptorStorageBufferCount - 1024u, // descriptorCombinedSamplerCount - 1024u, // descriptorStorageImageCount - 32u, // descriptorPileSizes - }; - - const api::QueryPoolConfig query_pool_config{}; - - const api::ContextConfig context_config{ - submit_frequency, // cmdSubmitFrequency - cmd_config, // cmdPoolConfig - descriptor_pool_config, // descriptorPoolConfig - query_pool_config, // queryPoolConfig - }; - - const GraphConfig graph_config{ - context_config, - }; - - return graph_config; + ET_CHECK_MSG( + in_tensor.numel() == et_tensor.numel(), + "Vulkan tensor numel %zu does not match ET tensor numel %zu", + static_cast(in_tensor.numel()), + static_cast(et_tensor.numel())); + + return should_resize; +} + +void maybe_resize_output( + ComputeGraph* graph, + const size_t output_i, + exec_aten::Tensor& et_tensor) { + ValueRef out_tensor_ref = graph->outputs()[output_i].value; + vTensor& out_tensor = graph->get_val(out_tensor_ref).toTensor(); + + exec_aten::SizesType new_output_size[kTensorDimensionLimit]; + size_t ndim = out_tensor.sizes().size(); + for (int i = 0; i < ndim; ++i) { + new_output_size[i] = out_tensor.sizes()[i]; + } + + exec_aten::ArrayRef output_size{new_output_size, ndim}; + Error err = resize_tensor(et_tensor, output_size); + + ET_CHECK_MSG(err == Error::Ok, "Failed to resize output tensor."); +} + +// +// VulkanBackend class +// + +class VulkanBackend final : public PyTorchBackendInterface { + public: + ~VulkanBackend() override = default; + + 
bool is_available() const override { + // TODO(ssjia): replace with an actual Vulkan runtime availability check + return true; } __ET_NODISCARD Error compileModel(const void* buffer_pointer, ComputeGraph* compute_graph) const { Result header = VulkanDelegateHeader::Parse(buffer_pointer); + const uint8_t* flatbuffer_data = nullptr; const uint8_t* constant_data = nullptr; @@ -169,92 +345,12 @@ class VulkanBackend final : public PyTorchBackendInterface { VkGraphPtr flatbuffer_graph = vkgraph::GetVkGraph(flatbuffer_data); - // Mapping from serialized VkValue ids to compute graph ValueRefs - // This will be populated as the compute graph is built - std::unordered_map ref_mapping; - - // A vector which acts as a mapping from VkValue ids (vector indices) to - // VkValues - VkValuesVector value_mapping = flatbuffer_graph->values(); - - // 1. Add all inputs (and corresponding tensors) to the compute graph - UIntVector input_ids = flatbuffer_graph->input_ids(); - - for (size_t input_index = 0; input_index < input_ids->size(); - ++input_index) { - const uint32_t input_id = input_ids->Get(input_index); - VkValuePtr input_vk_value = value_mapping->Get(input_id); - - VkTensorPtr input_vk_tensor = input_vk_value->value(); - - ET_CHECK_MSG( - input_vk_tensor->constant_id() < 0, - "Expected constant buffer index for input at index %zu with id %d to be < 0 (since it is non-constant), but got: %d", - input_index, - input_id, - input_vk_tensor->constant_id()); - - const api::ScalarType& input_dtype = - get_scalar_type(input_vk_tensor->datatype()); + GraphBuilder builder = + GraphBuilder(compute_graph, flatbuffer_graph, constant_data); - UIntVector input_dims_fb = input_vk_tensor->dims(); - const std::vector input_dims_vector( - input_dims_fb->cbegin(), input_dims_fb->cend()); + builder.build_graph(); - const ValueRef input_ref = compute_graph->add_tensor( - input_dims_vector, input_dtype, input_vk_tensor->mem_obj_id()); - - ref_mapping[input_id] = input_ref; - compute_graph->set_input_tensor(input_ref); - } - - // 2. Add all ops to the graph - // TODO: Generalize for ops that don't have 2 inputs and 1 output. - for (OpCallPtr op_call : *(flatbuffer_graph->chain())) { - std::string op_name = op_call->name()->str(); - - ET_CHECK_MSG( - op_call->args() != nullptr && op_call->args()->size() == 3, - "Vulkan currently only supports OperatorCall with 3 args"); - const auto arg_ids = op_call->args()->data(); - - const uint32_t input1_id = arg_ids[0]; - const uint32_t input2_id = arg_ids[1]; - const uint32_t output_id = arg_ids[2]; - - const ValueRef input1_ref = get_value_ref( - input1_id, - flatbuffer_graph, - compute_graph, - ref_mapping, - value_mapping, - constant_data); - - const ValueRef input2_ref = get_value_ref( - input2_id, - flatbuffer_graph, - compute_graph, - ref_mapping, - value_mapping, - constant_data); - - ET_CHECK_MSG(hasOpsFn(op_name), "Missing operator: %s", op_name.c_str()); - auto vkFn = getOpsFn(op_name); - const at::native::vulkan::ValueRef output_ref = vkFn( - *compute_graph, - {input1_ref, - input2_ref, - 1, - value_mapping->Get(output_id)->value()->mem_obj_id()}); - - ref_mapping[output_id] = output_ref; - } - - // 3. 
Add all outputs to the compute graph - for (const uint32_t output_id : *flatbuffer_graph->output_ids()) { - const ValueRef output_ref = ref_mapping[output_id]; - compute_graph->set_output_tensor(output_ref); - } + compute_graph->prepare(); compute_graph->encode_prepack(); compute_graph->prepack(); @@ -271,7 +367,7 @@ class VulkanBackend final : public PyTorchBackendInterface { ComputeGraph* compute_graph = ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR( context.get_runtime_allocator(), ComputeGraph); - new (compute_graph) ComputeGraph(generate_config()); + new (compute_graph) ComputeGraph(GraphConfig()); Error err = compileModel(processed->data(), compute_graph); @@ -291,20 +387,28 @@ class VulkanBackend final : public PyTorchBackendInterface { ComputeGraph* compute_graph = static_cast(handle); const size_t num_inputs = compute_graph->inputs().size(); + bool should_propagate_resize = false; for (size_t i = 0; i < num_inputs; i++) { + bool was_resized = + maybe_resize_input(compute_graph, i, args[i]->toTensor()); + should_propagate_resize = should_propagate_resize || was_resized; compute_graph->copy_into_staging( - compute_graph->inputs()[i], + compute_graph->inputs()[i].staging, args[i]->toTensor().const_data_ptr(), args[i]->toTensor().numel()); } + if (should_propagate_resize) { + compute_graph->propagate_resize(); + } compute_graph->execute(); for (size_t i = 0; i < compute_graph->outputs().size(); i++) { + maybe_resize_output(compute_graph, i, args[num_inputs + i]->toTensor()); // args holds inputs directly followed by outputs, so the i'th output // for compute_graph corresponds to the (i + num_inputs)'th arg compute_graph->copy_from_staging( - compute_graph->outputs()[i], + compute_graph->outputs()[i].staging, args[num_inputs + i]->toTensor().mutable_data_ptr(), args[num_inputs + i]->toTensor().numel()); } diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index 5adb5691e3..d51dc99689 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -6,16 +6,23 @@ * LICENSE file in the root directory of this source tree. 
*/ +// @lint-ignore-every CLANGTIDY +// facebook-security-vulnerable-integer-sign-conversion + #include #include +#include + namespace at { namespace native { namespace vulkan { ComputeGraph::ComputeGraph(GraphConfig config) : config_{config}, + prepack_descriptor_counts_{}, + execute_descriptor_counts_{}, context_{new api::Context( api::runtime()->default_adapter_i(), config_.contextConfig)}, @@ -25,6 +32,19 @@ ComputeGraph::ComputeGraph(GraphConfig config) execute_nodes_{}, inputs_{}, outputs_{} { + // Ensure that descriptor counts are initialized to 0 + prepack_descriptor_counts_.descriptorPoolMaxSets = 0; + prepack_descriptor_counts_.descriptorUniformBufferCount = 0; + prepack_descriptor_counts_.descriptorStorageBufferCount = 0; + prepack_descriptor_counts_.descriptorCombinedSamplerCount = 0; + prepack_descriptor_counts_.descriptorStorageImageCount = 0; + + execute_descriptor_counts_.descriptorPoolMaxSets = 0; + execute_descriptor_counts_.descriptorUniformBufferCount = 0; + execute_descriptor_counts_.descriptorStorageBufferCount = 0; + execute_descriptor_counts_.descriptorCombinedSamplerCount = 0; + execute_descriptor_counts_.descriptorStorageImageCount = 0; + context_->set_cmd(/*reusable = */ true); } @@ -37,6 +57,33 @@ ComputeGraph::~ComputeGraph() { context_->flush(); } +void ComputeGraph::update_descriptor_counts( + const api::ShaderInfo& shader_info, + bool execute) { + api::DescriptorPoolConfig* config = + execute ? &execute_descriptor_counts_ : &prepack_descriptor_counts_; + + config->descriptorPoolMaxSets += 1; + for (const VkDescriptorType arg_type : shader_info.kernel_layout) { + switch (arg_type) { + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + config->descriptorUniformBufferCount += 1; + break; + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + config->descriptorStorageBufferCount += 1; + break; + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + config->descriptorCombinedSamplerCount += 1; + break; + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + config->descriptorStorageImageCount += 1; + break; + default: + VK_THROW("Unsupported descriptor type!"); + } + } +} + ValueRef ComputeGraph::add_tensor( const std::vector& sizes, const api::ScalarType dtype, @@ -75,17 +122,29 @@ ValueRef ComputeGraph::add_staging( return idx; } +ValueRef ComputeGraph::add_value_list(std::vector&& value) { + ValueRef idx(static_cast(values_.size())); + values_.emplace_back(std::move(value)); + return idx; +} + +ValueRef ComputeGraph::add_string(std::string&& str) { + ValueRef idx(static_cast(values_.size())); + values_.emplace_back(std::move(str)); + return idx; +} + ValueRef ComputeGraph::set_input_tensor( const ValueRef idx, const bool use_staging) { if (use_staging) { vTensor& tensor = get_val(idx).toTensor(); ValueRef staging_idx = add_staging(tensor.dtype(), tensor.gpu_numel()); - execute_nodes_.emplace_back(new StagingNode(staging_idx, idx)); - inputs_.push_back(staging_idx); + add_staging_to_tensor_node(*this, staging_idx, idx); + inputs_.push_back({idx, staging_idx}); return staging_idx; } - inputs_.push_back(idx); + inputs_.push_back({idx, kDummyValueRef}); return idx; } @@ -95,11 +154,11 @@ ValueRef ComputeGraph::set_output_tensor( if (use_staging) { vTensor& tensor = get_val(idx).toTensor(); ValueRef staging_idx = add_staging(tensor.dtype(), tensor.gpu_numel()); - execute_nodes_.emplace_back(new StagingNode(idx, staging_idx)); - outputs_.push_back(staging_idx); + add_tensor_to_staging_node(*this, idx, staging_idx); + outputs_.push_back({idx, staging_idx}); return staging_idx; } - 
outputs_.push_back(idx); + outputs_.push_back({idx, kDummyValueRef}); return idx; } @@ -130,6 +189,30 @@ void ComputeGraph::copy_from_staging( copy_staging_to_ptr(staging, data, nbytes); } +void ComputeGraph::prepare() { +#define MERGE_FIELD(field) \ + static_cast(std::ceil( \ + std::max( \ + execute_descriptor_counts_.field, \ + prepack_descriptor_counts_.field) * \ + config_.descriptorPoolSafetyFactor)) + + uint32_t max_sets = MERGE_FIELD(descriptorPoolMaxSets); + api::DescriptorPoolConfig config{ + max_sets, + std::max(MERGE_FIELD(descriptorUniformBufferCount), max_sets), + std::max(MERGE_FIELD(descriptorStorageBufferCount), max_sets), + std::max(MERGE_FIELD(descriptorCombinedSamplerCount), max_sets), + std::max(MERGE_FIELD(descriptorStorageImageCount), max_sets), + 1u, + }; + + if (!context_->descriptor_pool()) { + context_->descriptor_pool().init(config); + } +#undef MERGE_FIELD +} + void ComputeGraph::encode_prepack() { for (std::unique_ptr& node : prepack_nodes_) { node->encode(this); @@ -165,6 +248,19 @@ void ComputeGraph::execute() const { fence.wait(); } +void ComputeGraph::resize_input( + const int64_t idx, + const std::vector& new_sizes) { + IOValueRef io_val = inputs_.at(idx); + get_val(io_val.value).toTensor().virtual_resize(new_sizes); +} + +void ComputeGraph::propagate_resize() { + for (std::unique_ptr& node : execute_nodes_) { + node->trigger_resize(this); + } +} + } // namespace vulkan } // namespace native } // namespace at diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index ec8d3ba1db..b5b2749dfb 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -12,9 +12,7 @@ #ifdef USE_VULKAN_API -#include -#include -#include +#include #include @@ -28,6 +26,19 @@ namespace at { namespace native { namespace vulkan { +// Define valid scalar types that the Value class can accept +template +struct is_valid_scalar_type : std::false_type {}; + +template <> +struct is_valid_scalar_type : std::true_type {}; + +template <> +struct is_valid_scalar_type : std::true_type {}; + +template <> +struct is_valid_scalar_type : std::true_type {}; + /* * This is the core data structure used to execute Vulkan models in graph mode. 
* As opposed to ATen/eager mode where a command buffer is encoded every @@ -47,6 +58,9 @@ class ComputeGraph final { private: GraphConfig config_; + api::DescriptorPoolConfig prepack_descriptor_counts_; + api::DescriptorPoolConfig execute_descriptor_counts_; + std::unique_ptr context_; std::vector shared_objects_; std::vector values_; @@ -54,8 +68,8 @@ class ComputeGraph final { std::vector> prepack_nodes_; std::vector> execute_nodes_; - std::vector inputs_; - std::vector outputs_; + std::vector inputs_; + std::vector outputs_; public: // @@ -66,14 +80,18 @@ class ComputeGraph final { return context_.get(); } - inline std::vector& inputs() { + inline std::vector& inputs() { return inputs_; } - inline std::vector& outputs() { + inline std::vector& outputs() { return outputs_; } + void update_descriptor_counts( + const api::ShaderInfo& shader_info, + bool execute); + /* * Returns the value at a particular reference */ @@ -123,9 +141,27 @@ class ComputeGraph final { const void* const data); ValueRef add_staging(const api::ScalarType dtype, const size_t numel); + template + typename std::enable_if::value, ValueRef>::type + add_scalar(T value); + + template + typename std::enable_if::value, ValueRef>::type + add_scalar_list(std::vector&& value); + + ValueRef add_value_list(std::vector&& value); + + ValueRef add_string(std::string&& str); + ValueRef set_input_tensor(const ValueRef idx, const bool use_staging = true); ValueRef set_output_tensor(const ValueRef idx, const bool use_staging = true); + template + inline std::shared_ptr create_params_buffer( + const Block& data) { + return std::make_shared(context_.get(), data); + } + /* * Convenience function to add an input tensor along with its staging buffer */ @@ -140,6 +176,12 @@ class ComputeGraph final { SharedObject& get_shared_object(const int64_t idx); + // + // Graph Preparation + // + + void prepare(); + // // Input/Output // @@ -161,8 +203,31 @@ class ComputeGraph final { void encode_execute(); void execute() const; + + // + // Dynamic Shape support + // + + void resize_input(const int64_t idx, const std::vector& new_sizes); + void propagate_resize(); }; +template +inline typename std::enable_if::value, ValueRef>::type +ComputeGraph::add_scalar(T value) { + ValueRef idx(static_cast(values_.size())); + values_.emplace_back(value); + return idx; +} + +template +inline typename std::enable_if::value, ValueRef>::type +ComputeGraph::add_scalar_list(std::vector&& value) { + ValueRef idx(static_cast(values_.size())); + values_.emplace_back(std::move(value)); + return idx; +} + } // namespace vulkan } // namespace native } // namespace at diff --git a/backends/vulkan/runtime/graph/GraphConfig.cpp b/backends/vulkan/runtime/graph/GraphConfig.cpp new file mode 100644 index 0000000000..8cda518dae --- /dev/null +++ b/backends/vulkan/runtime/graph/GraphConfig.cpp @@ -0,0 +1,56 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +namespace at { +namespace native { +namespace vulkan { + +GraphConfig::GraphConfig() { + // No automatic submissions + const uint32_t submit_frequency = UINT32_MAX; + + // Only one command buffer will be encoded at a time + const api::CommandPoolConfig cmd_config{ + 1u, // cmdPoolInitialSize + 1u, // cmdPoolBatchSize + }; + + // Use lazy descriptor pool initialization by default; the graph runtime will + // tally up the number of descriptor sets needed while building the graph and + // trigger descriptor pool initialization with exact sizes before encoding the + // command buffer. + const api::DescriptorPoolConfig descriptor_pool_config{ + 0u, // descriptorPoolMaxSets + 0u, // descriptorUniformBufferCount + 0u, // descriptorStorageBufferCount + 0u, // descriptorCombinedSamplerCount + 0u, // descriptorStorageImageCount + 0u, // descriptorPileSizes + }; + + const api::QueryPoolConfig query_pool_config{}; + + const api::ContextConfig context_config{ + submit_frequency, // cmdSubmitFrequency + cmd_config, // cmdPoolConfig + descriptor_pool_config, // descriptorPoolConfig + query_pool_config, // queryPoolConfig + }; + + contextConfig = context_config; + + // Empirically selected safety factor. If descriptor pools start running out + // of memory, increase this safety factor. + descriptorPoolSafetyFactor = 1.25; +} + +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/backends/vulkan/runtime/graph/GraphConfig.h b/backends/vulkan/runtime/graph/GraphConfig.h index 0cb9bb6f53..ce0f0839a9 100644 --- a/backends/vulkan/runtime/graph/GraphConfig.h +++ b/backends/vulkan/runtime/graph/GraphConfig.h @@ -10,7 +10,7 @@ #ifdef USE_VULKAN_API -#include +#include namespace at { namespace native { @@ -18,6 +18,16 @@ namespace vulkan { struct GraphConfig final { api::ContextConfig contextConfig; + + // Creating a descriptor pool with exactly the number of descriptors tallied + // by iterating through the shader layouts of shaders used in the graph risks + // the descriptor pool running out of memory, therefore apply a safety factor + // to descriptor counts when creating the descriptor pool to mitigate this + // risk. 
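
To make the descriptor-pool sizing described above concrete: the graph tallies per-shader descriptor counts while nodes are added, and ComputeGraph::prepare() then merges the prepack and execute tallies, scales them by the safety factor, and only afterwards initializes the pool. The self-contained sketch below reproduces just that merge step; PoolCounts is a stand-in for api::DescriptorPoolConfig, not the real type, and this is an illustration rather than part of the patch.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

// Stand-in for api::DescriptorPoolConfig; field meanings mirror the diff above.
struct PoolCounts {
  uint32_t maxSets;
  uint32_t uniformBuffers;
  uint32_t storageBuffers;
  uint32_t combinedSamplers;
  uint32_t storageImages;
};

// Merge the prepack and execute tallies and apply a safety factor, mirroring
// the MERGE_FIELD logic in ComputeGraph::prepare().
PoolCounts merge_with_safety(
    const PoolCounts& prepack,
    const PoolCounts& execute,
    float safety_factor) {
  auto merge = [&](uint32_t a, uint32_t b) {
    return static_cast<uint32_t>(
        std::ceil(static_cast<float>(std::max(a, b)) * safety_factor));
  };
  PoolCounts out{};
  out.maxSets = merge(prepack.maxSets, execute.maxSets);
  // Each per-type count is also clamped to at least maxSets, as in prepare().
  out.uniformBuffers =
      std::max(merge(prepack.uniformBuffers, execute.uniformBuffers), out.maxSets);
  out.storageBuffers =
      std::max(merge(prepack.storageBuffers, execute.storageBuffers), out.maxSets);
  out.combinedSamplers =
      std::max(merge(prepack.combinedSamplers, execute.combinedSamplers), out.maxSets);
  out.storageImages =
      std::max(merge(prepack.storageImages, execute.storageImages), out.maxSets);
  return out;
}

int main() {
  const PoolCounts prepack{4, 2, 6, 0, 4};
  const PoolCounts execute{10, 12, 8, 10, 10};
  const PoolCounts pool = merge_with_safety(prepack, execute, 1.25f);
  std::cout << pool.maxSets << " sets, " << pool.uniformBuffers
            << " uniform buffers\n";
  return 0;
}

With zero-initialized counts and the default factor of 1.25, the pool ends up sized to the larger of the two passes plus 25 percent headroom, and no per-type count falls below maxSets.
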
+ float descriptorPoolSafetyFactor; + + // Generate a default graph config with pre-configured settings + explicit GraphConfig(); }; } // namespace vulkan diff --git a/backends/vulkan/runtime/graph/containers/Types.cpp b/backends/vulkan/runtime/graph/containers/Types.cpp index bbfde572b0..0779ed8716 100644 --- a/backends/vulkan/runtime/graph/containers/Types.cpp +++ b/backends/vulkan/runtime/graph/containers/Types.cpp @@ -12,20 +12,25 @@ namespace at { namespace native { namespace vulkan { +#define PRINT_CASE(name) \ + case TypeTag::name: \ + out << #name; \ + break; + std::ostream& operator<<(std::ostream& out, const TypeTag& tag) { switch (tag) { - case TypeTag::NONE: - out << "NONE"; - break; - case TypeTag::TENSOR: - out << "TENSOR"; - break; - case TypeTag::STAGING: - out << "STAGING"; - break; - default: - out << "UNKNOWN"; - break; + PRINT_CASE(NONE) + PRINT_CASE(INT) + PRINT_CASE(DOUBLE) + PRINT_CASE(BOOL) + PRINT_CASE(TENSOR) + PRINT_CASE(STAGING) + PRINT_CASE(TENSORREF) + PRINT_CASE(INTLIST) + PRINT_CASE(DOUBLELIST) + PRINT_CASE(BOOLLIST) + PRINT_CASE(VALUELIST) + PRINT_CASE(STRING) } return out; } diff --git a/backends/vulkan/runtime/graph/containers/Types.h b/backends/vulkan/runtime/graph/containers/Types.h index a7162d777a..d5dee7ea0d 100644 --- a/backends/vulkan/runtime/graph/containers/Types.h +++ b/backends/vulkan/runtime/graph/containers/Types.h @@ -23,12 +23,21 @@ namespace vulkan { */ enum class TypeTag : uint32_t { NONE, - TENSOR, - STAGING, - TENSORREF, + // Scalar types INT, DOUBLE, BOOL, + // Tensor and tensor adjacent types + TENSOR, + STAGING, + TENSORREF, + // Scalar lists + INTLIST, + DOUBLELIST, + BOOLLIST, + // Special Type + VALUELIST, + STRING, }; std::ostream& operator<<(std::ostream& out, const TypeTag& tag); diff --git a/backends/vulkan/runtime/graph/containers/Value.h b/backends/vulkan/runtime/graph/containers/Value.h index d56791b4fa..82ba941713 100644 --- a/backends/vulkan/runtime/graph/containers/Value.h +++ b/backends/vulkan/runtime/graph/containers/Value.h @@ -22,6 +22,19 @@ namespace at { namespace native { namespace vulkan { +using ValueRef = int32_t; + +constexpr ValueRef kDummyValueRef = -1; + +inline bool is_valid(ValueRef value_ref) { + return value_ref >= 0; +} + +struct IOValueRef { + ValueRef value; + ValueRef staging; +}; + /* * This class is modelled after c10::IValue; however, it is simplified and does * not support as many types. However, the core design is the same; it is a @@ -48,6 +61,17 @@ struct Value final { api::StorageBuffer as_staging; TensorRef as_tensorref; + std::vector as_int_list; + std::vector as_double_list; + std::vector as_bool_list; + + // The below is a special type that is used to represent a list of other + // values stored in the graph. One application of the type is to represent + // a list of tensors or a list of optional tensors. 
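
The list and string payload members above and below are non-trivial types stored inside a union, so Value has to construct them with placement new and destroy them by calling their destructors explicitly, which is exactly what its move constructor and destructor do further down. A stripped-down, self-contained sketch of that pattern follows; MiniValue is hypothetical, carries only two payload types, and is not the real Value class.

#include <cstdint>
#include <new>
#include <utility>
#include <vector>

class MiniValue {
 public:
  enum class Tag { INT, VALUELIST };

  explicit MiniValue(int64_t v) : tag_(Tag::INT) { payload_.as_int = v; }

  explicit MiniValue(std::vector<int32_t>&& refs) : tag_(Tag::VALUELIST) {
    // Non-trivial union members must be constructed with placement new.
    new (&payload_.as_value_list) std::vector<int32_t>(std::move(refs));
  }

  MiniValue(MiniValue&& rhs) noexcept : tag_(rhs.tag_) {
    switch (tag_) {
      case Tag::INT:
        payload_.as_int = rhs.payload_.as_int;
        break;
      case Tag::VALUELIST:
        new (&payload_.as_value_list)
            std::vector<int32_t>(std::move(rhs.payload_.as_value_list));
        break;
    }
  }

  ~MiniValue() {
    // ...and destroyed explicitly, since the union cannot do it for us.
    if (tag_ == Tag::VALUELIST) {
      payload_.as_value_list.~vector();
    }
  }

  MiniValue(const MiniValue&) = delete;
  MiniValue& operator=(const MiniValue&) = delete;
  MiniValue& operator=(MiniValue&&) = delete;

 private:
  union Payload {
    int64_t as_int;
    std::vector<int32_t> as_value_list;
    Payload() : as_int(0) {}
    ~Payload() {}
  } payload_;
  Tag tag_;
};

int main() {
  MiniValue scalar(int64_t{42});
  MiniValue list(std::vector<int32_t>{0, 1, 2});
  MiniValue moved(std::move(list));
  return 0;
}
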
+ std::vector as_value_list; + + std::string as_string; + Payload() : u() {} // NOLINTNEXTLINE ~Payload(){}; @@ -68,21 +92,48 @@ struct Value final { Value& operator=(Value&&) = delete; +#define CASE_MOVE_TRIVIALLY_COPYABLE_TYPE(type_tag, member_name) \ + case type_tag: \ + payload.u.member_name = rhs.payload.u.member_name; \ + break; + +#define CASE_MOVE_MOVEABLE_TYPE(type_tag, type, member_name) \ + case type_tag: \ + new (&payload.member_name) type(std::move(rhs.payload.member_name)); \ + break; + Value(Value&& rhs) noexcept : tag(rhs.tag) { - if (rhs.isTensor()) { - new (&payload.as_tensor) vTensor(std::move(rhs.payload.as_tensor)); - } else if (rhs.isStaging()) { - new (&payload.as_staging) - api::StorageBuffer(std::move(rhs.payload.as_staging)); - } else if (rhs.isTensorRef()) { - payload.as_tensorref = std::move(rhs.payload.as_tensorref); - } else { - payload.u = rhs.payload.u; + switch (tag) { + // Scalar types + CASE_MOVE_TRIVIALLY_COPYABLE_TYPE(TypeTag::INT, as_int); + CASE_MOVE_TRIVIALLY_COPYABLE_TYPE(TypeTag::DOUBLE, as_double); + CASE_MOVE_TRIVIALLY_COPYABLE_TYPE(TypeTag::BOOL, as_bool); + // Tensor and tensor adjacent types + CASE_MOVE_MOVEABLE_TYPE(TypeTag::TENSOR, vTensor, as_tensor); + CASE_MOVE_MOVEABLE_TYPE(TypeTag::STAGING, api::StorageBuffer, as_staging); + CASE_MOVE_MOVEABLE_TYPE(TypeTag::TENSORREF, TensorRef, as_tensorref); + // Scalar lists + CASE_MOVE_MOVEABLE_TYPE( + TypeTag::INTLIST, std::vector, as_int_list); + CASE_MOVE_MOVEABLE_TYPE( + TypeTag::DOUBLELIST, std::vector, as_double_list); + CASE_MOVE_MOVEABLE_TYPE( + TypeTag::BOOLLIST, std::vector, as_bool_list); + // Special types + CASE_MOVE_MOVEABLE_TYPE( + TypeTag::VALUELIST, std::vector, as_value_list); + CASE_MOVE_MOVEABLE_TYPE(TypeTag::STRING, std::string, as_string); + + case TypeTag::NONE: + clearToNone(); + break; } - tag = rhs.tag; rhs.clearToNone(); } +#undef CASE_MOVE_TRIVIALLY_COPYABLE_TYPE +#undef CASE_MOVE_MOVEABLE_TYPE + // // Accessors // @@ -96,77 +147,127 @@ struct Value final { // ~Value() { - if (this->isTensor()) { - payload.as_tensor.~vTensor(); - } else if (this->isStaging()) { - payload.as_staging.~StorageBuffer(); - } else if (this->isTensorRef()) { - payload.as_tensorref.~TensorRef(); + switch (tag) { + case TypeTag::TENSOR: + payload.as_tensor.~vTensor(); + break; + case TypeTag::STAGING: + payload.as_staging.~StorageBuffer(); + break; + case TypeTag::TENSORREF: + payload.as_tensorref.~TensorRef(); + break; + case TypeTag::INTLIST: + payload.as_int_list.~vector(); + break; + case TypeTag::DOUBLELIST: + payload.as_double_list.~vector(); + break; + case TypeTag::BOOLLIST: + payload.as_bool_list.~vector(); + break; + case TypeTag::VALUELIST: + payload.as_value_list.~vector(); + break; + case TypeTag::STRING: + payload.as_string.~basic_string(); + break; + // Manually list out the types so that if a type here is added later and + // not handled the compiler can catch it. 
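
This works because a switch over an enum class with no default branch is checked by the compiler: if a new TypeTag is added later and a case is missing, warnings such as -Wswitch flag it. A minimal self-contained illustration of the idiom, with hypothetical names unrelated to the real code:

#include <iostream>

// If a new enumerator is added here and the switch below is not updated,
// compilers warn under -Wswitch because the switch has no default branch.
enum class DemoTag { NONE, INT, TENSOR };

const char* describe(DemoTag tag) {
  switch (tag) {
    case DemoTag::NONE:
      return "none";
    case DemoTag::INT:
      return "int";
    case DemoTag::TENSOR:
      return "tensor";
  }
  // Unreachable for valid enum values; keeps -Wreturn-type quiet.
  return "unknown";
}

int main() {
  std::cout << describe(DemoTag::TENSOR) << "\n";
  return 0;
}
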
+ case TypeTag::NONE: + case TypeTag::INT: + case TypeTag::DOUBLE: + case TypeTag::BOOL: + break; } } - // - // Tensor - // - - explicit Value(vTensor&& t) : tag(TypeTag::TENSOR) { - new (&payload.as_tensor) vTensor(std::move(t)); - } - - inline bool isTensor() const { - return TypeTag::TENSOR == tag; - } - - inline vTensor& toTensor() { - VK_CHECK_COND( - isTensor(), - "Expected value to have type TENSOR, got ", - tag, - " instead."); - return payload.as_tensor; +#define SUPPORT_TRIVIALLY_COPYABLE_TYPE( \ + type, type_name, type_tag, member_name) \ + explicit Value(type t) : tag(type_tag) { \ + payload.u.member_name = t; \ + } \ + inline bool is##type_name() const { \ + return tag == type_tag; \ + } \ + inline const type& to##type_name() const { \ + VK_CHECK_COND( \ + is##type_name(), \ + "Expected value to have type " #type_name ", got ", \ + tag, \ + " instead."); \ + return payload.u.member_name; \ } - // - // Staging - // - - explicit Value(api::StorageBuffer&& t) : tag(TypeTag::STAGING) { - new (&payload.as_staging) api::StorageBuffer(std::move(t)); - } - - inline bool isStaging() const { - return TypeTag::STAGING == tag; - } - - inline api::StorageBuffer& toStaging() { - VK_CHECK_COND( - isStaging(), - "Expected value to have type STAGING, got ", - tag, - " instead."); - return payload.as_staging; - } - - // - // TensorRef - // - - explicit Value(TensorRef&& t) : tag(TypeTag::TENSORREF) { - payload.as_tensorref = std::move(t); - } - - inline bool isTensorRef() const { - return TypeTag::TENSORREF == tag; + SUPPORT_TRIVIALLY_COPYABLE_TYPE(int64_t, Int, TypeTag::INT, as_int); + SUPPORT_TRIVIALLY_COPYABLE_TYPE(double, Double, TypeTag::DOUBLE, as_double); + SUPPORT_TRIVIALLY_COPYABLE_TYPE(bool, Bool, TypeTag::BOOL, as_bool); + +#undef SUPPORT_TRIVIALLY_COPYABLE_TYPE + +#define SUPPORT_TRIVIALLY_MOVEABLE_TYPE( \ + type, type_name, type_tag, member_name) \ + explicit Value(type&& t) : tag(type_tag) { \ + new (&payload.member_name) type(std::move(t)); \ + } \ + inline bool is##type_name() const { \ + return tag == type_tag; \ + } \ + inline type& to##type_name() { \ + VK_CHECK_COND( \ + is##type_name(), \ + "Expected value to have type " #type_name ", got ", \ + tag, \ + " instead."); \ + return payload.member_name; \ } - inline TensorRef& toTensorRef() { - VK_CHECK_COND( - isTensorRef(), - "Expected value to have type TENSORREF, got ", - tag, - " instead."); - return payload.as_tensorref; - } + SUPPORT_TRIVIALLY_MOVEABLE_TYPE(vTensor, Tensor, TypeTag::TENSOR, as_tensor); + + SUPPORT_TRIVIALLY_MOVEABLE_TYPE( + api::StorageBuffer, + Staging, + TypeTag::STAGING, + as_staging); + + SUPPORT_TRIVIALLY_MOVEABLE_TYPE( + TensorRef, + TensorRef, + TypeTag::TENSORREF, + as_tensorref); + + SUPPORT_TRIVIALLY_MOVEABLE_TYPE( + std::vector, + IntList, + TypeTag::INTLIST, + as_int_list); + + SUPPORT_TRIVIALLY_MOVEABLE_TYPE( + std::vector, + DoubleList, + TypeTag::DOUBLELIST, + as_double_list); + + SUPPORT_TRIVIALLY_MOVEABLE_TYPE( + std::vector, + BoolList, + TypeTag::BOOLLIST, + as_bool_list); + + SUPPORT_TRIVIALLY_MOVEABLE_TYPE( + std::vector, + ValueList, + TypeTag::VALUELIST, + as_value_list); + + SUPPORT_TRIVIALLY_MOVEABLE_TYPE( + std::string, + String, + TypeTag::STRING, + as_string); + +#undef SUPPORT_TRIVIALLY_COPYABLE_TYPE +#undef SUPPORT_TRIVIALLY_MOVEABLE_TYPE private: Payload payload; @@ -177,18 +278,11 @@ struct Value final { // inline void clearToNone() noexcept { - payload.u.as_int = 0; + payload.u.as_int = -1; tag = TypeTag::NONE; } }; -using ValueRef = int32_t; - -struct IOValueRef { - 
ValueRef value; - ValueRef staging; -}; - } // namespace vulkan } // namespace native } // namespace at diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp b/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp index 6bdb07e719..e9d5ab18b4 100644 --- a/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp +++ b/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp @@ -10,12 +10,31 @@ #include -#include +#include namespace at { namespace native { namespace vulkan { +ExecuteNode::ExecuteNode( + ComputeGraph& graph, + const api::ShaderInfo& shader, + const api::utils::uvec3& global_workgroup_size, + const api::utils::uvec3& local_workgroup_size, + const std::vector& args, + const std::vector>& params, + const ResizeFunction& resize_fn, + const std::vector& resize_args) + : shader_(shader), + global_workgroup_size_(global_workgroup_size), + local_workgroup_size_(local_workgroup_size), + args_(args), + params_(params), + resize_fn_(resize_fn), + resize_args_(resize_args) { + graph.update_descriptor_counts(shader, /*execute = */ true); +} + void ExecuteNode::encode(ComputeGraph* graph) { api::Context* const context = graph->context(); api::PipelineBarrier pipeline_barrier{}; @@ -27,20 +46,8 @@ void ExecuteNode::encode(ComputeGraph* graph) { uint32_t idx = 0; idx = bind_values_to_descriptor_set( - graph, - outputs_, - pipeline_barrier, - api::MemoryAccessType::WRITE, - descriptor_set, - idx); - idx = bind_values_to_descriptor_set( - graph, - inputs_, - pipeline_barrier, - api::MemoryAccessType::READ, - descriptor_set, - idx); - descriptor_set.bind(idx, params_.buffer()); + graph, args_, pipeline_barrier, descriptor_set, idx); + bind_params_to_descriptor_set(params_, descriptor_set, idx); context->register_shader_dispatch( descriptor_set, pipeline_barrier, shader_, global_workgroup_size_); diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.h b/backends/vulkan/runtime/graph/ops/ExecuteNode.h index 1b726e73d4..9d9beeab65 100644 --- a/backends/vulkan/runtime/graph/ops/ExecuteNode.h +++ b/backends/vulkan/runtime/graph/ops/ExecuteNode.h @@ -10,9 +10,7 @@ #ifdef USE_VULKAN_API -#include -#include -#include +#include #include @@ -22,48 +20,67 @@ namespace vulkan { class ComputeGraph; +/* + * Represents a group of shader arguments (images and/or buffers), with a common + * access permission. + */ +struct ArgGroup { + ArgGroup(const ValueRef ref, const api::MemoryAccessType access) + : refs{ref}, access(access) {} + + ArgGroup( + const std::vector& refs, + const api::MemoryAccessType access) + : refs(refs), access(access) {} + + const std::vector refs; + const api::MemoryAccessType access; +}; + /* * Represents a single execution op in a ML model. In graph mode, ops will be * implemented in a derived class that implements encode, which will implement * encoding of the shader corresponding to the op into the command buffer of a * ComputeGraph. 
*/ -class ExecuteNode { +class ExecuteNode final { friend class ComputeGraph; public: - ExecuteNode(ValueRef input, ValueRef output) - : outputs_{output}, inputs_{input} {} + using ResizeFunction = const std::function&, + const std::vector&)>; ExecuteNode( + ComputeGraph& graph, const api::ShaderInfo& shader, const api::utils::uvec3& global_workgroup_size, const api::utils::uvec3& local_workgroup_size, - const std::vector& outputs, - const std::vector& inputs, - api::UniformParamsBuffer&& params) - : shader_(shader), - global_workgroup_size_(global_workgroup_size), - local_workgroup_size_(local_workgroup_size), - outputs_(outputs), - inputs_(inputs), - params_(std::move(params)) {} - - virtual ~ExecuteNode() = default; + const std::vector& args, + const std::vector>& params, + const ResizeFunction& resize_fn = nullptr, + const std::vector& resize_args = {}); + + ~ExecuteNode() = default; + + void encode(ComputeGraph* graph); + + inline void trigger_resize(ComputeGraph* graph) { + if (resize_fn_ != nullptr) { + resize_fn_(graph, args_, resize_args_); + } + } protected: - // TODO: Consider making members const after we remove StagingNode. - api::ShaderInfo shader_; - api::utils::uvec3 global_workgroup_size_; - api::utils::uvec3 local_workgroup_size_; - std::vector outputs_; - std::vector inputs_; - // TODO(T180906086): pass multiple buffers and index with ValueRef. + const api::ShaderInfo shader_; + const api::utils::uvec3 global_workgroup_size_; + const api::utils::uvec3 local_workgroup_size_; + const std::vector args_; // TODO(T180906457): allow re-computing param buffers. - api::UniformParamsBuffer params_; - - public: - virtual void encode(ComputeGraph* graph); + std::vector> params_; + const ResizeFunction resize_fn_; + const std::vector resize_args_; }; } // namespace vulkan diff --git a/backends/vulkan/runtime/graph/ops/OperatorRegistry.cpp b/backends/vulkan/runtime/graph/ops/OperatorRegistry.cpp index 0d46e5b351..9f489e1c3f 100644 --- a/backends/vulkan/runtime/graph/ops/OperatorRegistry.cpp +++ b/backends/vulkan/runtime/graph/ops/OperatorRegistry.cpp @@ -8,48 +8,28 @@ #include -#include - namespace at { namespace native { namespace vulkan { -bool hasOpsFn(const std::string& name) { - return OperatorRegistry::getInstance().hasOpsFn(name); -} - -OpFunction& getOpsFn(const std::string& name) { - return OperatorRegistry::getInstance().getOpsFn(name); +bool OperatorRegistry::has_op(const std::string& name) { + return table_.count(name) > 0; } -OperatorRegistry& OperatorRegistry::getInstance() { - static OperatorRegistry instance; - return instance; +OperatorRegistry::OpFunction& OperatorRegistry::get_op_fn( + const std::string& name) { + return table_.find(name)->second; } -bool OperatorRegistry::hasOpsFn(const std::string& name) { - return OperatorRegistry::kTable.count(name) > 0; +void OperatorRegistry::register_op(const std::string& name, OpFunction& fn) { + table_.insert(std::make_pair(name, fn)); } -OpFunction& OperatorRegistry::getOpsFn(const std::string& name) { - return OperatorRegistry::kTable.find(name)->second; +OperatorRegistry& operator_registry() { + static OperatorRegistry registry; + return registry; } -// @lint-ignore-every CLANGTIDY modernize-avoid-bind -// clang-format off -#define OPERATOR_ENTRY(name, function) \ - { #name, std::bind(&at::native::vulkan::function, std::placeholders::_1, std::placeholders::_2) } -// clang-format on - -const OperatorRegistry::OpTable OperatorRegistry::kTable = { - OPERATOR_ENTRY(aten.add.Tensor, add), - OPERATOR_ENTRY(aten.sub.Tensor, 
sub), - OPERATOR_ENTRY(aten.mul.Tensor, mul), - OPERATOR_ENTRY(aten.div.Tensor, div), - OPERATOR_ENTRY(aten.div.Tensor_mode, floor_div), - OPERATOR_ENTRY(aten.pow.Tensor_Tensor, pow), -}; - } // namespace vulkan } // namespace native } // namespace at diff --git a/backends/vulkan/runtime/graph/ops/OperatorRegistry.h b/backends/vulkan/runtime/graph/ops/OperatorRegistry.h index c11aa0168e..1088ab2e44 100644 --- a/backends/vulkan/runtime/graph/ops/OperatorRegistry.h +++ b/backends/vulkan/runtime/graph/ops/OperatorRegistry.h @@ -15,45 +15,68 @@ #include #include +#define VK_HAS_OP(name) ::at::native::vulkan::operator_registry().has_op(name) + +#define VK_GET_OP_FN(name) \ + ::at::native::vulkan::operator_registry().get_op_fn(name) + +#define VK_REGISTER_OP(name, function) \ + ::at::native::vulkan::operator_registry().register_op( \ + #name, \ + std::bind(&function, std::placeholders::_1, std::placeholders::_2)) + +#define REGISTER_OPERATORS \ + static void register_ops(); \ + static const OperatorRegisterInit reg(®ister_ops); \ + static void register_ops() + namespace at { namespace native { namespace vulkan { -using OpFunction = const std::function&)>; // TODO: Generalize to - // support float, - // int64_t. - -bool hasOpsFn(const std::string& name); +/* + * The Vulkan operator registry maps ATen operator names to their Vulkan + * delegate function implementation. It is a simplified version of + * executorch/runtime/kernel/operator_registry.h that uses the C++ Standard + * Library. + */ +class OperatorRegistry final { + using OpFunction = + const std::function&)>; + using OpTable = std::unordered_map; -OpFunction& getOpsFn(const std::string& name); + OpTable table_; -// The Vulkan operator registry is a simplified version of -// fbcode/executorch/runtime/kernel/operator_registry.h -// that uses the C++ Standard Library. -class OperatorRegistry { public: - static OperatorRegistry& getInstance(); + /* + * Check if the registry has an operator registered under the given name + */ + bool has_op(const std::string& name); - bool hasOpsFn(const std::string& name); - OpFunction& getOpsFn(const std::string& name); + /* + * Given an operator name, return the Vulkan delegate function + */ + OpFunction& get_op_fn(const std::string& name); - OperatorRegistry(const OperatorRegistry&) = delete; - OperatorRegistry(OperatorRegistry&&) = delete; - OperatorRegistry& operator=(const OperatorRegistry&) = delete; - OperatorRegistry& operator=(OperatorRegistry&&) = delete; + /* + * Register a function to a given operator name + */ + void register_op(const std::string& name, OpFunction& fn); +}; - private: - // TODO: Input string corresponds to target_name. We may need to pass kwargs. - using OpTable = std::unordered_map; - // @lint-ignore CLANGTIDY facebook-hte-NonPodStaticDeclaration - static const OpTable kTable; +class OperatorRegisterInit final { + using InitFn = void(); - OperatorRegistry() = default; - ~OperatorRegistry() = default; + public: + explicit OperatorRegisterInit(InitFn* init_fn) { + init_fn(); + } }; +// The Vulkan operator registry is global. It is retrieved using this function, +// where it is declared as a static local variable. 
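
The VK_REGISTER_OP / REGISTER_OPERATORS machinery above is an instance of a common C++ pattern: a function-local static registry that is constructed on first use, plus small static objects whose constructors run a registration function during static initialization, before main(). The sketch below shows that pattern with hypothetical names (MiniRegistry, mini_registry, MiniRegisterInit); it is not the Vulkan registry itself.

#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>

class MiniRegistry {
 public:
  using OpFn = std::function<void(int)>;

  bool has_op(const std::string& name) const {
    return table_.count(name) > 0;
  }
  OpFn& get_op_fn(const std::string& name) {
    return table_.find(name)->second;
  }
  void register_op(const std::string& name, OpFn fn) {
    table_.emplace(name, std::move(fn));
  }

 private:
  std::unordered_map<std::string, OpFn> table_;
};

// Function-local static: constructed on first use, shared by all callers.
MiniRegistry& mini_registry() {
  static MiniRegistry registry;
  return registry;
}

// A static object of this type runs its init function during static
// initialization, i.e. before main() executes.
struct MiniRegisterInit {
  explicit MiniRegisterInit(void (*init_fn)()) { init_fn(); }
};

static void register_demo_ops() {
  mini_registry().register_op("demo.add", [](int x) {
    std::cout << "demo.add called with " << x << "\n";
  });
}
static const MiniRegisterInit reg(&register_demo_ops);

int main() {
  if (mini_registry().has_op("demo.add")) {
    mini_registry().get_op_fn("demo.add")(3);
  }
  return 0;
}

Registering from a static initializer is what lets each op implementation file, such as BinaryOp.cpp above, self-register without a central operator table.
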
+OperatorRegistry& operator_registry(); + } // namespace vulkan } // namespace native } // namespace at diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp new file mode 100644 index 0000000000..c21c1447d9 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include +#include + +namespace at { +namespace native { +namespace vulkan { + +PrepackNode::PrepackNode( + ComputeGraph& graph, + const api::ShaderInfo& shader, + const api::utils::uvec3& global_workgroup_size, + const api::utils::uvec3& local_workgroup_size, + const ValueRef tref, + const ValueRef packed, + const std::vector>& params) + : shader_(shader), + global_workgroup_size_(global_workgroup_size), + local_workgroup_size_(local_workgroup_size), + tref_(tref), + packed_(packed), + params_(params) { + graph.update_descriptor_counts(shader, /*execute = */ false); +} + +void PrepackNode::encode(ComputeGraph* graph) { + api::Context* const context = graph->context(); + api::PipelineBarrier pipeline_barrier{}; + + TensorRef tref = graph->get_val(tref_).toTensorRef(); + vTensor packed = graph->get_val(packed_).toTensor(); + + // TODO: Extract to standalone function, to support other types of prepacking. + api::StorageBuffer staging( + graph->context(), packed.dtype(), packed.gpu_nbytes()); + size_t numel = api::utils::multiply_integers(tref.sizes); + size_t nbytes = numel * api::element_size(tref.dtype); + copy_ptr_to_staging(tref.data, staging, nbytes); + + std::unique_lock cmd_lock = context->dispatch_lock(); + + api::DescriptorSet descriptor_set = + context->get_descriptor_set(shader_, local_workgroup_size_); + + uint32_t idx = 0; + bind_tensor_to_descriptor_set( + packed, + pipeline_barrier, + api::MemoryAccessType::WRITE, + descriptor_set, + idx++); + bind_staging_to_descriptor_set(staging, descriptor_set, idx++); + bind_params_to_descriptor_set(params_, descriptor_set, idx); + + context->register_shader_dispatch( + descriptor_set, pipeline_barrier, shader_, global_workgroup_size_); +} + +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.h b/backends/vulkan/runtime/graph/ops/PrepackNode.h index 6f581eb931..7d8a8b4ce3 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.h +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.h @@ -10,9 +10,7 @@ #ifdef USE_VULKAN_API -#include -#include -#include +#include #include @@ -28,20 +26,31 @@ class ComputeGraph; * encoding of shaders transferring necessary data (such as weights and biases) * to the GPU. 
*/ -class PrepackNode { +class PrepackNode final { friend class ComputeGraph; public: - PrepackNode(ValueRef tref, ValueRef packed) : tref_{tref}, packed_{packed} {} + PrepackNode( + ComputeGraph& graph, + const api::ShaderInfo& shader, + const api::utils::uvec3& global_workgroup_size, + const api::utils::uvec3& local_workgroup_size, + const ValueRef tref, + const ValueRef packed, + const std::vector>& params); - virtual ~PrepackNode() = default; + ~PrepackNode() = default; - protected: - ValueRef tref_; - ValueRef packed_; + void encode(ComputeGraph* graph); - public: - virtual void encode(ComputeGraph* graph) const = 0; + protected: + const api::ShaderInfo shader_; + const api::utils::uvec3 global_workgroup_size_; + const api::utils::uvec3 local_workgroup_size_; + const ValueRef tref_; + const ValueRef packed_; + // TODO(T180906457): allow re-computing param buffers. + std::vector> params_; }; } // namespace vulkan diff --git a/backends/vulkan/runtime/graph/ops/glsl/all_shaders.yaml b/backends/vulkan/runtime/graph/ops/glsl/all_shaders.yaml new file mode 100644 index 0000000000..a1abc6a745 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/all_shaders.yaml @@ -0,0 +1,62 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +binary_op: + parameter_names_with_default_values: + OPERATOR: X + A * Y + NDIM: 3 + DTYPE: float + PACKING: CHANNELS_PACKED + generate_variant_forall: + DTYPE: + - VALUE: "half" + SUFFIX: "half" + - VALUE: "float" + SUFFIX: "float" + shader_variants: + - NAME: binary_add + - NAME: binary_sub + OPERATOR: X - A * Y + - NAME: binary_mul + OPERATOR: X * Y + - NAME: binary_div + OPERATOR: X / Y + - NAME: binary_pow + OPERATOR: pow(X, Y) + - NAME: binary_floor_divide + OPERATOR: floor(X / Y) + +image_to_nchw: + parameter_names_with_default_values: + NDIM: 3 + DTYPE: float + PACKING: CHANNELS_PACKED + generate_variant_forall: + DTYPE: + - VALUE: "half" + SUFFIX: "half" + - VALUE: "float" + SUFFIX: "float" + shader_variants: + - NAME: image3d_to_nchw_C_packed + - NAME: image2d_to_nchw_C_packed + NDIM: 2 + +nchw_to_image: + parameter_names_with_default_values: + NDIM: 3 + DTYPE: float + PACKING: CHANNELS_PACKED + generate_variant_forall: + DTYPE: + - VALUE: "half" + SUFFIX: "half" + - VALUE: "float" + SUFFIX: "float" + shader_variants: + - NAME: nchw_to_image3d_C_packed + - NAME: nchw_to_image2d_C_packed + NDIM: 2 diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl new file mode 100644 index 0000000000..f7bcdaa232 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl @@ -0,0 +1,67 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#include "broadcasting_utils.h" +#include "indexing_utils.h" + +#define PRECISION ${PRECISION} + +#define OP(X, Y, A) ${OPERATOR} + +layout(std430) buffer; + +layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; +layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; +layout(set = 0, binding = 2) uniform PRECISION sampler3D image_other; + +layout(set = 0, binding = 3) uniform PRECISION restrict OutSizes { + ivec4 data; +} +out_sizes; + +layout(set = 0, binding = 4) uniform PRECISION restrict InSizes { + ivec4 data; +} +in_sizes; + +layout(set = 0, binding = 5) uniform PRECISION restrict OtherSizes { + ivec4 data; +} +other_sizes; + +layout(set = 0, binding = 6) uniform PRECISION restrict Alpha { + float data; +} +alpha; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + const ivec4 coord = POS_TO_COORD_${PACKING}(pos, out_sizes.data); + + if (any(greaterThanEqual(coord, out_sizes.data))) { + return; + } + + ivec4 in_coord = out_coord_to_in_coord(coord, in_sizes.data); + vec4 in_texel = texelFetch( + image_in, + COORD_TO_POS_${PACKING}(in_coord, in_sizes.data), + 0); + + ivec4 other_coord = out_coord_to_in_coord(coord, other_sizes.data); + vec4 other_texel = texelFetch( + image_other, + COORD_TO_POS_${PACKING}(other_coord, other_sizes.data), + 0); + + imageStore(image_out, pos, OP(in_texel, other_texel, alpha.data)); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/broadcasting_utils.h b/backends/vulkan/runtime/graph/ops/glsl/broadcasting_utils.h new file mode 100644 index 0000000000..dc8635b881 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/broadcasting_utils.h @@ -0,0 +1,17 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +ivec4 out_coord_to_in_coord(const ivec4 out_coord, const ivec4 in_sizes) { + ivec4 in_coord = out_coord; + for (int i = 0; i < 4; ++i) { + if (in_sizes[i] == 1) { + in_coord[i] = 0; + } + } + return in_coord; +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl new file mode 100644 index 0000000000..f966f7584b --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl @@ -0,0 +1,61 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#include "indexing_utils.h" + +layout(std430) buffer; + +layout(set = 0, binding = 0) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} image_in; +layout(set = 0, binding = 1) buffer PRECISION restrict writeonly Buffer { + ${T[DTYPE]} data[]; +} +buffer_out; + +layout(set = 0, binding = 2) uniform PRECISION restrict GpuSizes { + ivec4 data; +} +gpu_sizes; + +layout(set = 0, binding = 3) uniform PRECISION restrict CpuSizes { + ivec4 data; +} +cpu_sizes; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + const ivec4 coord = POS_TO_COORD_${PACKING}(pos, gpu_sizes.data); + + if (any(greaterThanEqual(coord, gpu_sizes.data))) { + return; + } + + const ${VEC4_T[DTYPE]} intex = texelFetch(image_in, ${GET_POS[NDIM]("pos")}, 0); + + const int base_index = COORD_TO_BUFFER_IDX(coord, cpu_sizes.data); + const ivec4 buf_indices = + base_index + ivec4(0, 1, 2, 3) * (gpu_sizes.data.x * gpu_sizes.data.y); + + if (coord.z < cpu_sizes.data.z) { + buffer_out.data[buf_indices.x] = intex.x; + } + if (coord.z + 1 < cpu_sizes.data.z) { + buffer_out.data[buf_indices.y] = intex.y; + } + if (coord.z + 2 < cpu_sizes.data.z) { + buffer_out.data[buf_indices.z] = intex.z; + } + if (coord.z + 3 < cpu_sizes.data.z) { + buffer_out.data[buf_indices.w] = intex.w; + } +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h new file mode 100644 index 0000000000..7bac6b5116 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h @@ -0,0 +1,17 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#define POS_TO_COORD_CHANNELS_PACKED(pos, sizes) \ + ivec4(pos.x, pos.y, (pos.z * 4) % sizes.z, (pos.z * 4) / sizes.z) + +#define COORD_TO_POS_CHANNELS_PACKED(coord, sizes) \ + ivec3(coord.x, coord.y, (coord.z + coord.w * sizes.z) / 4) + +#define COORD_TO_BUFFER_IDX(coord, sizes) \ + coord.x + coord.y* sizes.x + coord.z* sizes.y* sizes.x + \ + coord.w* sizes.z* sizes.y* sizes.x; diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl new file mode 100644 index 0000000000..00ed3fe5e4 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl @@ -0,0 +1,61 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#include "indexing_utils.h" + +layout(std430) buffer; + +layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; +layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer { + ${T[DTYPE]} data[]; +} +buffer_in; + +layout(set = 0, binding = 2) uniform PRECISION restrict GpuSizes { + ivec4 data; +} +gpu_sizes; + +layout(set = 0, binding = 3) uniform PRECISION restrict CpuSizes { + ivec4 data; +} +cpu_sizes; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + const ivec4 coord = POS_TO_COORD_${PACKING}(pos, gpu_sizes.data); + + if (any(greaterThanEqual(coord, gpu_sizes.data))) { + return; + } + + const int base_index = COORD_TO_BUFFER_IDX(coord, cpu_sizes.data); + const ivec4 buf_indices = + base_index + ivec4(0, 1, 2, 3) * (gpu_sizes.data.x * gpu_sizes.data.y); + + ${T[DTYPE]} val_x = buffer_in.data[buf_indices.x]; + ${T[DTYPE]} val_y = buffer_in.data[buf_indices.y]; + ${T[DTYPE]} val_z = buffer_in.data[buf_indices.z]; + ${T[DTYPE]} val_w = buffer_in.data[buf_indices.w]; + + ${VEC4_T[DTYPE]} texel = ${VEC4_T[DTYPE]}(val_x, val_y, val_z, val_w); + + if (coord.z + 3 >= cpu_sizes.data.z) { + ivec4 c_ind = ivec4(coord.z) + ivec4(0, 1, 2, 3); + vec4 valid_c = vec4(lessThan(c_ind, ivec4(cpu_sizes.data.z))); + texel = texel * valid_c; + } + + imageStore(image_out, ${GET_POS[NDIM]("pos")}, texel); +} diff --git a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp b/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp deleted file mode 100644 index ce43005384..0000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -namespace at { -namespace native { -namespace vulkan { - -#define DEFINE_ARITHMETIC_FN(function, shader) \ - ValueRef function(ComputeGraph& graph, const std::vector& args) { \ - return add_arithmetic_node( \ - graph, args[0], args[1], args[2], VK_KERNEL(shader), args[3]); \ - } - -DEFINE_ARITHMETIC_FN(add, add); -DEFINE_ARITHMETIC_FN(sub, sub); -DEFINE_ARITHMETIC_FN(mul, mul); -DEFINE_ARITHMETIC_FN(div, div); -DEFINE_ARITHMETIC_FN(floor_div, floor_divide); -DEFINE_ARITHMETIC_FN(pow, pow); - -// TODO(T180908843): Bypass this entrypoint function by creating `ValueRef out` -// ahead of time. -ValueRef add_arithmetic_node( - ComputeGraph& graph, - const ValueRef in1, - const ValueRef in2, - const float alpha, - const api::ShaderInfo& shader, - const int64_t shared_object_idx) { - std::vector in1_sizes = graph.get_val_sizes(in1); - api::ScalarType in1_dtype = graph.get_val_dtype(in1); - - ValueRef out = graph.add_tensor(in1_sizes, in1_dtype, shared_object_idx); - add_arithmetic_node(graph, in1, in2, out, alpha, shader); - return out; -} - -// TODO(T181006464): Move to Utils when we remove ArithmeticPrepack. 
-ValueRef prepack_if_tensor_ref(ComputeGraph& graph, const ValueRef v) { - if (graph.get_val(v).isTensor()) { - return v; - } else { - TensorRef& tRef = graph.get_val(v).toTensorRef(); - ValueRef vTen = graph.add_tensor(tRef.sizes, tRef.dtype); - graph.prepack_nodes().emplace_back(new ArithmeticPrepack(v, vTen)); - return vTen; - } -} - -void add_arithmetic_node( - ComputeGraph& graph, - const ValueRef in1, - const ValueRef in2, - const ValueRef out, - const float alpha, - const api::ShaderInfo& shader) { - ValueRef arg1 = prepack_if_tensor_ref(graph, in1); - ValueRef arg2 = prepack_if_tensor_ref(graph, in2); - - vTensor& t_in1 = graph.get_val(arg1).toTensor(); - vTensor& t_in2 = graph.get_val(arg2).toTensor(); - vTensor& t_out = graph.get_val(out).toTensor(); - - api::utils::uvec3 global_size = t_out.extents(); - api::utils::uvec3 local_size = adaptive_work_group_size(global_size); - - ArithmeticParams block{ - get_size_as_ivec4(t_out), - get_size_as_ivec4(t_in1), - get_size_as_ivec4(t_in2), - 1.0, - }; - api::UniformParamsBuffer params(graph.context(), block); - - graph.execute_nodes().emplace_back(new ExecuteNode( - shader, global_size, local_size, {out}, {arg1, arg2}, std::move(params))); -} - -ArithmeticPrepack::ArithmeticPrepack(const ValueRef tref, const ValueRef packed) - : PrepackNode(tref, packed) {} - -void ArithmeticPrepack::encode(ComputeGraph* graph) const { - TensorRef tref = graph->get_val(tref_).toTensorRef(); - vTensor packed = graph->get_val(packed_).toTensor(); - - api::StorageBuffer staging( - graph->context(), packed.dtype(), packed.gpu_nbytes()); - - size_t numel = api::utils::multiply_integers(tref.sizes); - size_t nbytes = numel * api::element_size(tref.dtype); - copy_ptr_to_staging(tref.data, staging, nbytes); - - encode_copy_to_vtensor(graph->context(), staging, packed); -} - -} // namespace vulkan -} // namespace native -} // namespace at diff --git a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.h b/backends/vulkan/runtime/graph/ops/impl/Arithmetic.h deleted file mode 100644 index 82e2aa2cdf..0000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once - -#ifdef USE_VULKAN_API - -#include - -#include - -#include - -namespace at { -namespace native { -namespace vulkan { - -DECLARE_OP_FN(add); -DECLARE_OP_FN(sub); -DECLARE_OP_FN(mul); -DECLARE_OP_FN(div); -DECLARE_OP_FN(floor_div); -DECLARE_OP_FN(pow); - -ValueRef add_arithmetic_node( - ComputeGraph& graph, - const ValueRef in1, - const ValueRef in2, - const float alpha, - const api::ShaderInfo& shader, - const int64_t shared_object_idx = -1); - -void add_arithmetic_node( - ComputeGraph& graph, - const ValueRef in1, - const ValueRef in2, - const ValueRef out, - const float alpha, - const api::ShaderInfo& shader); - -struct ArithmeticParams final { - api::utils::ivec4 outputSizes; - api::utils::ivec4 input1Sizes; - api::utils::ivec4 input2Sizes; - float alpha; -}; - -class ArithmeticPrepack : public virtual PrepackNode { - public: - explicit ArithmeticPrepack(const ValueRef tref, const ValueRef packed); - - void encode(ComputeGraph* graph) const override; -}; - -} // namespace vulkan -} // namespace native -} // namespace at - -#endif /* USE_VULKAN_API */ diff --git a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp new file mode 100644 index 0000000000..1d637ecb34 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp @@ -0,0 +1,123 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include +#include + +#include + +namespace at { +namespace native { +namespace vulkan { + +void resize_binary_op_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + (void)extra_args; + vTensor& out = graph->get_val(args[0].refs[0]).toTensor(); + vTensor& self = graph->get_val(args[1].refs[0]).toTensor(); + vTensor& other = graph->get_val(args[1].refs[1]).toTensor(); + + std::vector new_out_sizes( + std::max(self.sizes().size(), other.sizes().size())); + + // Match the sizes in reverse because sizes are in NCHW order + for (int i = -1; i >= -new_out_sizes.size(); --i) { + new_out_sizes.at(new_out_sizes.size() + i) = std::max( + api::utils::val_at(i, self.sizes()), + api::utils::val_at(i, other.sizes())); + } + + out.virtual_resize(new_out_sizes); +} + +void add_binary_op_node( + ComputeGraph& graph, + const ValueRef in1, + const ValueRef in2, + const ValueRef alpha, + const ValueRef out, + const std::string& op_name) { + ValueRef arg1 = prepack_if_tensor_ref(graph, in1); + ValueRef arg2 = prepack_if_tensor_ref(graph, in2); + + vTensor& t_in1 = graph.get_val(arg1).toTensor(); + vTensor& t_in2 = graph.get_val(arg2).toTensor(); + vTensor& t_out = graph.get_val(out).toTensor(); + + api::utils::uvec3 global_size = t_out.virtual_extents(); + api::utils::uvec3 local_size = adaptive_work_group_size(global_size); + + float alpha_val = 1.0f; + // String is checked since floor_div passes in an unused string argument in + // place of alpha + if (is_valid(alpha) && !graph.get_val(alpha).isString()) { + alpha_val = extract_scalar(graph.get_val(alpha)); + } + + std::stringstream kernel_name; + kernel_name << "binary_" << op_name; + apply_dtype_suffix(kernel_name, t_out); + + graph.execute_nodes().emplace_back(new ExecuteNode( + graph, + VK_KERNEL_FROM_STR(kernel_name.str()), + global_size, + local_size, + // Inputs and Outputs + {{out, api::MemoryAccessType::WRITE}, + {{arg1, arg2}, 
api::MemoryAccessType::READ}}, + // Shader params buffers + {t_out.gpu_sizes_ubo(), + t_in1.gpu_sizes_ubo(), + t_in2.gpu_sizes_ubo(), + graph.create_params_buffer(alpha_val)}, + // Resizing + resize_binary_op_node)); +} + +#define DEFINE_BINARY_OP_WITH_ALPHA_FN(op_name) \ + void op_name(ComputeGraph& graph, const std::vector& args) { \ + return add_binary_op_node( \ + graph, args[0], args[1], args[2], args[3], #op_name); \ + } + +#define DEFINE_BINARY_OP_FN(op_name) \ + void op_name(ComputeGraph& graph, const std::vector& args) { \ + return add_binary_op_node( \ + graph, args[0], args[1], kDummyValueRef, args[2], #op_name); \ + } + +DEFINE_BINARY_OP_WITH_ALPHA_FN(add); +DEFINE_BINARY_OP_WITH_ALPHA_FN(sub); + +// Floor div does not have an alpha, but a string argument (which is unused) is +// passed in at the same location as the alpha argument in other op. +DEFINE_BINARY_OP_WITH_ALPHA_FN(floor_divide); + +DEFINE_BINARY_OP_FN(mul); +DEFINE_BINARY_OP_FN(div); +DEFINE_BINARY_OP_FN(pow); + +REGISTER_OPERATORS { + VK_REGISTER_OP(aten.add.Tensor, add); + VK_REGISTER_OP(aten.sub.Tensor, sub); + VK_REGISTER_OP(aten.mul.Tensor, mul); + VK_REGISTER_OP(aten.div.Tensor, div); + VK_REGISTER_OP(aten.div.Tensor_mode, floor_divide); + VK_REGISTER_OP(aten.pow.Tensor_Tensor, pow); +} + +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp index 5b16780777..b3319e6dac 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp @@ -8,116 +8,86 @@ #include -#include +#include + +#include +#include namespace at { namespace native { namespace vulkan { -void memcpy_to_mapping( - const void* src, - api::MemoryMap& dst_mapping, - const size_t nbytes, - const api::ScalarType dtype) { -#define DTYPE_CASE(ctype, vkformat, name) \ - case api::ScalarType::name: \ - memcpy_to_mapping_impl(src, dst_mapping, nbytes); \ - break; - - switch (dtype) { - VK_FORALL_SCALAR_TYPES(DTYPE_CASE) - default: - VK_THROW("Unrecognized dtype!"); - } -#undef DTYPE_CASE -} - -void memcpy_from_mapping( - api::MemoryMap& src_mapping, - void* dst, - const size_t nbytes, - const api::ScalarType dtype) { -#define DTYPE_CASE(ctype, vkformat, name) \ - case api::ScalarType::name: \ - memcpy_from_mapping_impl(src_mapping, dst, nbytes); \ - break; - - switch (dtype) { - VK_FORALL_SCALAR_TYPES(DTYPE_CASE) - default: - VK_THROW("Unrecognized dtype!"); - } -#undef DTYPE_CASE -} +void add_staging_to_tensor_node( + ComputeGraph& graph, + const ValueRef in_staging, + const ValueRef out_tensor) { + vTensor& t_out = graph.get_val(out_tensor).toTensor(); + VK_CHECK_COND(graph.get_val(in_staging).isStaging()); -void copy_ptr_to_staging( - const void* src, - api::StorageBuffer& staging, - const size_t nbytes) { - api::MemoryMap mapping(staging.buffer(), api::MemoryAccessType::WRITE); - mapping.invalidate(); - memcpy_to_mapping(src, mapping, nbytes, staging.dtype()); -} + api::ShaderInfo shader = get_nchw_to_image_shader(t_out); -void copy_staging_to_ptr( - api::StorageBuffer& staging, - void* dst, - const size_t nbytes) { - api::MemoryMap mapping(staging.buffer(), api::MemoryAccessType::READ); - mapping.invalidate(); - memcpy_from_mapping(mapping, dst, nbytes, staging.dtype()); -} + api::utils::uvec3 global_size = t_out.extents(); + api::utils::uvec3 local_size = adaptive_work_group_size(global_size); -void encode_copy_to_vtensor( - api::Context* context, - 
api::StorageBuffer& staging, - vTensor& tensor) { - api::ShaderInfo shader = packing::get_nchw_to_image_shader(tensor); - api::PipelineBarrier pipeline_barrier{}; - packing::record_nchw_to_image_op( - context, + graph.execute_nodes().emplace_back(new ExecuteNode( + graph, shader, - staging.buffer(), - tensor, - pipeline_barrier, - VK_NULL_HANDLE); + global_size, + local_size, + {{out_tensor, api::MemoryAccessType::WRITE}, + {in_staging, api::MemoryAccessType::READ}}, + {t_out.gpu_sizes_ubo(), t_out.cpu_sizes_ubo()})); } -void encode_copy_from_vtensor( - api::Context* context, - vTensor& tensor, - api::StorageBuffer& staging) { - api::ShaderInfo shader = packing::get_image_to_nchw_shader(tensor); - api::PipelineBarrier pipeline_barrier{}; - packing::record_image_to_nchw_op( - context, +void add_tensor_to_staging_node( + ComputeGraph& graph, + const ValueRef in_tensor, + const ValueRef out_staging) { + vTensor& t_in = graph.get_val(in_tensor).toTensor(); + VK_CHECK_COND(graph.get_val(out_staging).isStaging()); + + api::ShaderInfo shader = get_image_to_nchw_shader(t_in); + + api::utils::uvec3 global_size = t_in.extents(); + api::utils::uvec3 local_size = adaptive_work_group_size(global_size); + + graph.execute_nodes().emplace_back(new ExecuteNode( + graph, shader, - tensor, - staging.buffer(), - pipeline_barrier, - VK_NULL_HANDLE); + global_size, + local_size, + {{in_tensor, api::MemoryAccessType::READ}, + {out_staging, api::MemoryAccessType::WRITE}}, + {t_in.gpu_sizes_ubo(), t_in.cpu_sizes_ubo()})); } -StagingNode::StagingNode(ValueRef from, ValueRef to) : ExecuteNode(from, to) {} +ValueRef prepack(ComputeGraph& graph, const ValueRef vref) { + TensorRef& tref = graph.get_val(vref).toTensorRef(); + ValueRef v = graph.add_tensor(tref.sizes, tref.dtype); + vTensor t = graph.get_val(v).toTensor(); + + api::ShaderInfo shader = get_nchw_to_image_shader(t); -void StagingNode::encode(ComputeGraph* graph) { - Value& in_val = graph->get_val(inputs_[0]); - Value& out_val = graph->get_val(outputs_[0]); + api::utils::uvec3 global_size = t.extents(); + api::utils::uvec3 local_size = adaptive_work_group_size(global_size); + + graph.prepack_nodes().emplace_back(new PrepackNode( + graph, + shader, + global_size, + local_size, + vref, + v, + {t.gpu_sizes_ubo(), t.cpu_sizes_ubo()})); + + return v; +} - if (in_val.isStaging() && out_val.isTensor()) { - api::StorageBuffer& from_staging = graph->get_val(inputs_[0]).toStaging(); - vTensor& to_tensor = graph->get_val(outputs_[0]).toTensor(); - encode_copy_to_vtensor(graph->context(), from_staging, to_tensor); - } else if (in_val.isTensor() && out_val.isStaging()) { - vTensor& from_tensor = graph->get_val(inputs_[0]).toTensor(); - api::StorageBuffer& to_staging = graph->get_val(outputs_[0]).toStaging(); - encode_copy_from_vtensor(graph->context(), from_tensor, to_staging); +ValueRef prepack_if_tensor_ref(ComputeGraph& graph, const ValueRef v) { + if (graph.get_val(v).isTensorRef()) { + return prepack(graph, v); } else { - VK_THROW( - "Unexpected input value type ", - in_val.type(), - " and output value type ", - out_val.type()); + return v; } } diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.h b/backends/vulkan/runtime/graph/ops/impl/Staging.h index be57a9817f..425d77489f 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.h +++ b/backends/vulkan/runtime/graph/ops/impl/Staging.h @@ -12,84 +12,23 @@ #include -#include +#include namespace at { namespace native { namespace vulkan { -// -// Functions to memcpy data into staging buffer -// +void 
add_staging_to_tensor_node( + ComputeGraph& graph, + const ValueRef in_staging, + const ValueRef out_tensor); -void memcpy_to_mapping( - const void* src, - api::MemoryMap& dst_mapping, - const size_t nbytes, - const api::ScalarType dtype); -void memcpy_from_mapping( - const api::MemoryMap& src_mapping, - void* dst, - const size_t nbytes, - const api::ScalarType dtype); +void add_tensor_to_staging_node( + ComputeGraph& graph, + const ValueRef in_tensor, + const ValueRef out_staging); -// -// Utility functions for memcpy -// - -template -void memcpy_to_mapping_impl( - const void* src, - api::MemoryMap& dst_mapping, - const size_t nbytes) { - T* data_ptr = dst_mapping.template data(); - memcpy(data_ptr, reinterpret_cast(src), nbytes); -} - -template -void memcpy_from_mapping_impl( - api::MemoryMap& src_mapping, - void* dst, - const size_t nbytes) { - T* data_ptr = src_mapping.template data(); - memcpy(reinterpret_cast(dst), data_ptr, nbytes); -} - -// -// Functions to copy data into and out of a staging buffer -// - -void copy_ptr_to_staging( - const void* src, - api::StorageBuffer& staging, - const size_t nbytes); -void copy_staging_to_ptr( - api::StorageBuffer& staging, - void* dst, - const size_t nbytes); - -// -// Functions to record copying data between a staging buffer and a vTensor -// - -void encode_copy_to_vtensor( - api::Context* context, - api::StorageBuffer& staging, - vTensor& tensor); -void encode_copy_from_vtensor( - api::Context* context, - vTensor& tensor, - api::StorageBuffer& staging); - -/* - * OpNode that allows copying data into and out of a staging buffer. - */ -class StagingNode : public virtual ExecuteNode { - public: - explicit StagingNode(ValueRef from, ValueRef to); - - void encode(ComputeGraph* graph) override; -}; +ValueRef prepack_if_tensor_ref(ComputeGraph& graph, const ValueRef v); } // namespace vulkan } // namespace native diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h b/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h new file mode 100644 index 0000000000..299c3bb99f --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#ifdef USE_VULKAN_API + +#include + +namespace at { +namespace native { +namespace vulkan { + +/* + * Maps a semantic dimension name to an integer that corresponds to its + * innermost ordering in a 4D tensor in NCHW format. Width is the innermost + * dimension, so it corresponds to 1, height is the next innermost, so it + * corresponds to 2, and so on. + */ +struct Dim4D { + static constexpr uint32_t Width = 1u; + static constexpr uint32_t Height = 2u; + static constexpr uint32_t Channel = 3u; + static constexpr uint32_t Batch = 4u; +}; + +/* + * Semantic dimension names for a 1D tensor + */ +struct Dim1D { + static constexpr uint32_t Length = 1u; +}; + +/* + * Semantic dimension names for a 2D Convolution kernel. + */ +struct DimConv2DKernel { + static constexpr uint32_t Width = 1u; + static constexpr uint32_t Height = 2u; + static constexpr uint32_t InChannels = 3u; + static constexpr uint32_t OutChannels = 4u; +}; + +/* + * The same as the above, except for a 2D Transposed Convolution kernel. 
+ */ +struct DimTConv2DKernel { + static constexpr uint32_t Width = 1u; + static constexpr uint32_t Height = 2u; + static constexpr uint32_t OutChannels = 3u; + static constexpr uint32_t InChannels = 4u; +}; + +/* + * The functions below safely return the size of the dimension at the N-th + * innermost index. If the dimensionality of the size array is not sufficient, + * then 1 will be returned. The structs above are intended to be used with + * these functions. + */ +template <uint32_t N> +uint32_t dim_at(const std::vector<int64_t>& sizes) { + const uint32_t dims = sizes.size(); + return dims < N ? 1 : api::utils::safe_downcast<uint32_t>(sizes[dims - N]); +} + +template <uint32_t N> +uint32_t dim_at(const vTensor& v_in) { + return dim_at<N>(v_in.sizes()); +} + +} // namespace vulkan +} // namespace native +} // namespace at + +#endif /* USE_VULKAN_API */ diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/ScalarUtils.h b/backends/vulkan/runtime/graph/ops/impl/utils/ScalarUtils.h new file mode 100644 index 0000000000..38cb8eed3b --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/utils/ScalarUtils.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#ifdef USE_VULKAN_API + +#include + +#include + +namespace at { +namespace native { +namespace vulkan { + +template <typename T> +T extract_scalar(const Value& value) { + if (value.isInt()) { + return static_cast<T>(value.toInt()); + } + if (value.isDouble()) { + return static_cast<T>(value.toDouble()); + } + if (value.isBool()) { + return static_cast<T>(value.toBool()); + } + VK_THROW("Cannot extract scalar from Value with type ", value.type()); +} + +} // namespace vulkan +} // namespace native +} // namespace at + +#endif /* USE_VULKAN_API */ diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp new file mode 100644 index 0000000000..72e1bc5a0d --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp @@ -0,0 +1,44 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +namespace at { +namespace native { +namespace vulkan { + +api::utils::uvec3 adaptive_work_group_size( + const api::utils::uvec3& global_work_group) { + api::utils::uvec3 local_group_size = {4, 4, 4}; + if (global_work_group.data[2u] == 1) { + if (global_work_group.data[1u] < 8) { + local_group_size.data[0u] = 16; + local_group_size.data[1u] = 4; + local_group_size.data[2u] = 1; + } else { + local_group_size.data[0u] = 8; + local_group_size.data[1u] = 8; + local_group_size.data[2u] = 1; + } + } + return local_group_size; +} + +api::utils::ivec4 get_size_as_ivec4(const vTensor& t) { + return api::utils::make_ivec4( + {dim_at<Dim4D::Width>(t), + dim_at<Dim4D::Height>(t), + dim_at<Dim4D::Channel>(t), + dim_at<Dim4D::Batch>(t)}); +} + +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h new file mode 100644 index 0000000000..a01e8c0a4d --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved.
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#ifdef USE_VULKAN_API + +#include + +namespace at { +namespace native { +namespace vulkan { + +api::utils::uvec3 adaptive_work_group_size( + const api::utils::uvec3& global_work_group); + +api::utils::ivec4 get_size_as_ivec4(const vTensor& t); + +} // namespace vulkan +} // namespace native +} // namespace at + +#endif /* USE_VULKAN_API */ diff --git a/backends/vulkan/runtime/graph/ops/Utils.cpp b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp similarity index 52% rename from backends/vulkan/runtime/graph/ops/Utils.cpp rename to backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp index 579eac54e3..6e1d9b3013 100644 --- a/backends/vulkan/runtime/graph/ops/Utils.cpp +++ b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp @@ -6,20 +6,12 @@ * LICENSE file in the root directory of this source tree. */ -#include +#include namespace at { namespace native { namespace vulkan { -api::utils::ivec4 get_size_as_ivec4(const vTensor& t) { - return api::utils::make_ivec4( - {dim_at(t), - dim_at(t), - dim_at(t), - dim_at(t)}); -} - void bind_tensor_to_descriptor_set( vTensor& tensor, api::PipelineBarrier& pipeline_barrier, @@ -39,25 +31,49 @@ void bind_tensor_to_descriptor_set( uint32_t bind_values_to_descriptor_set( ComputeGraph* graph, - const std::vector& args, + const std::vector& args, api::PipelineBarrier& pipeline_barrier, - const api::MemoryAccessType accessType, api::DescriptorSet& descriptor_set, const uint32_t base_idx) { uint32_t idx = base_idx; for (auto& arg : args) { - Value& val = graph->get_val(arg); - if (val.isTensor()) { - vTensor& tensor = val.toTensor(); - bind_tensor_to_descriptor_set( - tensor, pipeline_barrier, accessType, descriptor_set, idx++); - } else { - VK_THROW("Unsupported type: ", val.type()); + for (auto& ref : arg.refs) { + Value& val = graph->get_val(ref); + if (val.isTensor()) { + bind_tensor_to_descriptor_set( + val.toTensor(), + pipeline_barrier, + arg.access, + descriptor_set, + idx++); + } else if (val.isStaging()) { + bind_staging_to_descriptor_set(val.toStaging(), descriptor_set, idx++); + } else { + VK_THROW("Unsupported type: ", val.type()); + } } } return idx; } +uint32_t bind_params_to_descriptor_set( + std::vector>& params, + api::DescriptorSet& descriptor_set, + const uint32_t base_idx) { + uint32_t idx = base_idx; + for (auto& param : params) { + descriptor_set.bind(idx++, param->buffer()); + } + return idx; +} + +void bind_staging_to_descriptor_set( + api::StorageBuffer& staging, + api::DescriptorSet& descriptor_set, + const uint32_t idx) { + descriptor_set.bind(idx, staging.buffer()); +} + } // namespace vulkan } // namespace native } // namespace at diff --git a/backends/vulkan/runtime/graph/ops/Utils.h b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h similarity index 67% rename from backends/vulkan/runtime/graph/ops/Utils.h rename to backends/vulkan/runtime/graph/ops/utils/BindingUtils.h index 9cf214ca87..e8d508b791 100644 --- a/backends/vulkan/runtime/graph/ops/Utils.h +++ b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h @@ -10,18 +10,15 @@ #ifdef USE_VULKAN_API -#include - #include namespace at { namespace native { namespace vulkan { -#define DECLARE_OP_FN(function) \ - ValueRef function(ComputeGraph& graph, const std::vector& args); - -api::utils::ivec4 get_size_as_ivec4(const vTensor& t); +// +// For objects in the graph +// void 
bind_tensor_to_descriptor_set( vTensor& tensor, @@ -32,12 +29,25 @@ void bind_tensor_to_descriptor_set( uint32_t bind_values_to_descriptor_set( ComputeGraph* graph, - const std::vector& args, + const std::vector& args, api::PipelineBarrier& pipeline_barrier, - const api::MemoryAccessType accessType, api::DescriptorSet& descriptor_set, const uint32_t base_idx); +// +// For objects NOT in the graph +// + +uint32_t bind_params_to_descriptor_set( + std::vector>& params, + api::DescriptorSet& descriptor_set, + const uint32_t base_idx); + +void bind_staging_to_descriptor_set( + api::StorageBuffer& staging, + api::DescriptorSet& descriptor_set, + const uint32_t idx); + } // namespace vulkan } // namespace native } // namespace at diff --git a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp new file mode 100644 index 0000000000..e941f32e16 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace at { +namespace native { +namespace vulkan { + +void apply_dtype_suffix(std::stringstream& kernel_name, const vTensor& tensor) { + switch (tensor.image().format()) { + case VK_FORMAT_R32G32B32A32_SFLOAT: + kernel_name << "_float"; + break; + case VK_FORMAT_R16G16B16A16_SFLOAT: + kernel_name << "_half"; + break; + case VK_FORMAT_R32G32B32A32_SINT: + kernel_name << "_int"; + break; + default: + break; + } +} + +void apply_memory_layout_suffix( + std::stringstream& kernel_name, + const vTensor& tensor) { + switch (tensor.gpu_memory_layout()) { + case api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED: + kernel_name << "_C_packed"; + break; + case api::GPUMemoryLayout::TENSOR_HEIGHT_PACKED: + kernel_name << "_H_packed"; + break; + case api::GPUMemoryLayout::TENSOR_WIDTH_PACKED: + kernel_name << "_W_packed"; + break; + default: + break; + } +} + +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h new file mode 100644 index 0000000000..b4c6c3a6bc --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#ifdef USE_VULKAN_API + +#include + +#include + +namespace at { +namespace native { +namespace vulkan { + +void apply_dtype_suffix(std::stringstream& kernel_name, const vTensor& tensor); + +void apply_memory_layout_suffix( + std::stringstream& kernel_name, + const vTensor& tensor); + +} // namespace vulkan +} // namespace native +} // namespace at + +#endif /* USE_VULKAN_API */ diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp new file mode 100644 index 0000000000..45307c8a9d --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp @@ -0,0 +1,144 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// @lint-ignore-every CLANGTIDY facebook-security-vulnerable-memcpy + +#include +#include + +#include + +#include + +namespace at { +namespace native { +namespace vulkan { + +template +void memcpy_to_mapping_impl( + const void* src, + api::MemoryMap& dst_mapping, + const size_t nbytes) { + T* data_ptr = dst_mapping.template data(); + memcpy(data_ptr, reinterpret_cast(src), nbytes); +} + +template +void memcpy_from_mapping_impl( + api::MemoryMap& src_mapping, + void* dst, + const size_t nbytes) { + T* data_ptr = src_mapping.template data(); + memcpy(reinterpret_cast(dst), data_ptr, nbytes); +} + +void memcpy_to_mapping( + const void* src, + api::MemoryMap& dst_mapping, + const size_t nbytes, + const api::ScalarType dtype) { +#define DTYPE_CASE(ctype, vkformat, name) \ + case api::ScalarType::name: \ + memcpy_to_mapping_impl(src, dst_mapping, nbytes); \ + break; + + switch (dtype) { + VK_FORALL_SCALAR_TYPES(DTYPE_CASE) + default: + VK_THROW("Unrecognized dtype!"); + } +#undef DTYPE_CASE +} + +void memcpy_from_mapping( + api::MemoryMap& src_mapping, + void* dst, + const size_t nbytes, + const api::ScalarType dtype) { +#define DTYPE_CASE(ctype, vkformat, name) \ + case api::ScalarType::name: \ + memcpy_from_mapping_impl(src_mapping, dst, nbytes); \ + break; + + switch (dtype) { + VK_FORALL_SCALAR_TYPES(DTYPE_CASE) + default: + VK_THROW("Unrecognized dtype!"); + } +#undef DTYPE_CASE +} + +void copy_ptr_to_staging( + const void* src, + api::StorageBuffer& staging, + const size_t nbytes) { + api::MemoryMap mapping(staging.buffer(), api::MemoryAccessType::WRITE); + mapping.invalidate(); + memcpy_to_mapping(src, mapping, nbytes, staging.dtype()); +} + +void copy_staging_to_ptr( + api::StorageBuffer& staging, + void* dst, + const size_t nbytes) { + api::MemoryMap mapping(staging.buffer(), api::MemoryAccessType::READ); + mapping.invalidate(); + memcpy_from_mapping(mapping, dst, nbytes, staging.dtype()); +} + +api::ShaderInfo get_nchw_to_image_shader(const vTensor& v_dst) { + if (v_dst.is_quantized()) { + VK_THROW("Quantized Tensors are currently not supported!"); + } + + std::stringstream kernel_name; + + switch (v_dst.storage_type()) { + case api::StorageType::TEXTURE_3D: + kernel_name << "nchw_to_image3d"; + break; + case api::StorageType::TEXTURE_2D: + kernel_name << "nchw_to_image2d"; + break; + default: + VK_THROW("No kernel available!"); + } + + apply_memory_layout_suffix(kernel_name, v_dst); + apply_dtype_suffix(kernel_name, v_dst); + + return VK_KERNEL_FROM_STR(kernel_name.str()); +} + +api::ShaderInfo get_image_to_nchw_shader(const vTensor& v_src) { + if (v_src.is_quantized()) { + VK_THROW("Quantized Tensors are currently not supported!"); + } + + std::stringstream kernel_name; + + switch (v_src.storage_type()) { + case api::StorageType::TEXTURE_3D: + kernel_name << "image3d_to_nchw"; + break; + case api::StorageType::TEXTURE_2D: + kernel_name << "image2d_to_nchw"; + break; + default: + VK_THROW("No kernel available!"); + } + + apply_memory_layout_suffix(kernel_name, v_src); + apply_dtype_suffix(kernel_name, v_src); + + return VK_KERNEL_FROM_STR(kernel_name.str()); +} + +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h new file mode 100644 index 0000000000..2e5de6efb0 --- /dev/null +++ 
b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#ifdef USE_VULKAN_API + +#include + +namespace at { +namespace native { +namespace vulkan { + +// +// Functions to copy data into and out of a staging buffer +// + +void copy_ptr_to_staging( + const void* src, + api::StorageBuffer& staging, + const size_t nbytes); +void copy_staging_to_ptr( + api::StorageBuffer& staging, + void* dst, + const size_t nbytes); + +// +// Functions to get shaders +// + +api::ShaderInfo get_nchw_to_image_shader(const vTensor& v_dst); +api::ShaderInfo get_image_to_nchw_shader(const vTensor& v_src); + +} // namespace vulkan +} // namespace native +} // namespace at + +#endif /* USE_VULKAN_API */ diff --git a/backends/vulkan/serialization/schema.fbs b/backends/vulkan/serialization/schema.fbs index 3d8dab9a2f..36f6120025 100644 --- a/backends/vulkan/serialization/schema.fbs +++ b/backends/vulkan/serialization/schema.fbs @@ -10,9 +10,13 @@ table OperatorCall { args:[int]; } -enum VkDataType : short { - // IEEE754 single-precision floating-point. - fp32 = 0, +enum VkDataType : byte { + BOOL = 0, + UINT8 = 1, + INT8 = 2, + INT32 = 3, + FLOAT16 = 4, + FLOAT32 = 5, } table VkTensor { @@ -26,8 +30,55 @@ table VkTensor { mem_obj_id:int; } +table Null {} + +table Int { + int_val:long; +} + +table Bool { + bool_val:bool; +} + +table Double { + double_val:double; +} + +table String { + string_val:string; +} + +table IntList { + items:[long]; +} + +table DoubleList { + items:[double]; +} + +table BoolList { + items:[bool]; +} + +table ValueList { + items:[int]; +} + +union GraphTypes { + Null, + Int, + Double, + Bool, + VkTensor, + IntList, + DoubleList, + BoolList, + ValueList, + String, +} + table VkValue { - value:VkTensor; + value:GraphTypes; } // Abstraction to represent a region of bytes in a raw data buffer. Useful for referencing raw data diff --git a/backends/vulkan/serialization/vulkan_graph_builder.py b/backends/vulkan/serialization/vulkan_graph_builder.py index 68e54c2bc3..f15e155703 100644 --- a/backends/vulkan/serialization/vulkan_graph_builder.py +++ b/backends/vulkan/serialization/vulkan_graph_builder.py @@ -4,7 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-from typing import Optional +import operator +from typing import cast, List, Optional, Union import executorch.backends.vulkan.serialization.vulkan_graph_schema as vk_graph_schema @@ -15,6 +16,9 @@ from torch.export import ExportedProgram from torch.fx import Node +_ScalarType = Union[bool, int, float] +_Argument = Union[Node, List[Node], TensorSpec, _ScalarType, List[_ScalarType], str] + class VkGraphBuilder: def __init__(self, program: ExportedProgram) -> None: @@ -26,28 +30,41 @@ def __init__(self, program: ExportedProgram) -> None: self.output_ids = [] self.const_tensors = [] - # Mapping from torch.fx.Node to VkValue id + # Mapping from Node to VkValue id self.node_to_value_ids = {} @staticmethod def get_vk_datatype(torch_dtype: torch.dtype) -> vk_graph_schema.VkDataType: - if torch_dtype == torch.float32: - return vk_graph_schema.VkDataType.fp32 + if torch_dtype == torch.bool: + return vk_graph_schema.VkDataType.BOOL + elif torch_dtype == torch.uint8: + return vk_graph_schema.VkDataType.UINT8 + elif torch_dtype == torch.int8: + return vk_graph_schema.VkDataType.INT8 + elif torch_dtype == torch.int32: + return vk_graph_schema.VkDataType.INT32 + elif torch_dtype == torch.float16: + return vk_graph_schema.VkDataType.FLOAT16 + elif torch_dtype == torch.float32: + return vk_graph_schema.VkDataType.FLOAT32 + # Narrowing conversion for index tensor produced by max_poolNd_with_indices. + elif torch_dtype == torch.int64: + return vk_graph_schema.VkDataType.INT32 else: raise AssertionError(f"Invalid dtype for vulkan_preprocess ({torch_dtype})") - def is_constant(self, node: torch.fx.Node): + def is_constant(self, node: Node): return ( node.name in self.program.graph_signature.inputs_to_lifted_tensor_constants ) - def is_get_attr_node(self, node: torch.fx.Node) -> bool: + def is_get_attr_node(self, node: Node) -> bool: """ Returns true if the given node is a get attr node for a tensor of the model """ - return isinstance(node, torch.fx.Node) and node.op == "get_attr" + return isinstance(node, Node) and node.op == "get_attr" - def is_param_node(self, node: torch.fx.Node) -> bool: + def is_param_node(self, node: Node) -> bool: """ Check if the given node is a parameter within the exported program """ @@ -58,7 +75,7 @@ def is_param_node(self, node: torch.fx.Node) -> bool: or self.is_constant(node) ) - def get_constant(self, node: torch.fx.Node) -> Optional[torch.Tensor]: + def get_constant(self, node: Node) -> Optional[torch.Tensor]: """ Returns the constant associated with the given node in the exported program. 
Returns None if the node is not a constant within the exported program @@ -76,7 +93,7 @@ def get_constant(self, node: torch.fx.Node) -> Optional[torch.Tensor]: return None - def get_param_tensor(self, node: torch.fx.Node) -> torch.Tensor: + def get_param_tensor(self, node: Node) -> torch.Tensor: tensor = None if node is None: raise RuntimeError("node is None") @@ -99,33 +116,20 @@ def get_param_tensor(self, node: torch.fx.Node) -> torch.Tensor: return tensor def maybe_add_constant_tensor(self, node: Node) -> int: - const_buffer_idx = -1 + constant_id = -1 if self.is_param_node(node): - const_buffer_idx = len(self.const_tensors) + constant_id = len(self.const_tensors) self.const_tensors.append(self.get_param_tensor(node)) - return const_buffer_idx - - def create_single_vk_value(self, node: Node) -> int: - constant_id = self.maybe_add_constant_tensor(node) - - spec = node.meta.get("spec") - assert isinstance(spec, TensorSpec) - new_id = len(self.values) - if node not in self.node_to_value_ids: - self.node_to_value_ids[node] = new_id - else: - current_ids = self.node_to_value_ids[node] - if isinstance(current_ids, int): - current_ids = [current_ids, new_id] - else: - current_ids.append(new_id) + return constant_id + def create_tensor_value(self, spec: TensorSpec, constant_id: int = -1) -> int: # Negative id indicates that this tensor will have its own dedicated memory. mem_obj_id = -1 if spec.mem_obj_id is not None: mem_obj_id = spec.mem_obj_id + new_id = len(self.values) self.values.append( vk_graph_schema.VkValue( value=vk_graph_schema.VkTensor( @@ -138,60 +142,158 @@ def create_single_vk_value(self, node: Node) -> int: ) return new_id - def create_vk_values_for(self, node: Node): + def create_node_value(self, node: Node) -> int: spec = node.meta.get("spec") if isinstance(spec, TensorSpec): - return self.create_single_vk_value(node) + constant_id = self.maybe_add_constant_tensor(node) + new_id = self.create_tensor_value(spec, constant_id) + self.node_to_value_ids[node] = new_id + return new_id + elif isinstance(spec, tuple): + # Create a Value for each element in the tuple, wrap Values in a + # ValueList, and map the Node to the ValueList id. + new_id = self.create_value_list_value(spec) + self.node_to_value_ids[node] = new_id + return new_id else: - raise RuntimeError( - "Creating values for nodes with collection types is not supported yet." 
+ raise RuntimeError(f"Cannot create value for spec of type {type(spec)}") + + def create_value_list_value(self, arg: List[Node] | tuple) -> int: + self.values.append( + vk_graph_schema.VkValue( + vk_graph_schema.ValueList( + items=[self.get_or_create_value_for(e) for e in arg] + ) + ) + ) + return len(self.values) - 1 + + def create_scalar_value(self, scalar: _ScalarType) -> int: + new_id = len(self.values) + if isinstance(scalar, bool): + self.values.append(vk_graph_schema.VkValue(vk_graph_schema.Bool(scalar))) + elif isinstance(scalar, int): + self.values.append(vk_graph_schema.VkValue(vk_graph_schema.Int(scalar))) + elif isinstance(scalar, float): + self.values.append(vk_graph_schema.VkValue(vk_graph_schema.Double(scalar))) + return new_id + + def create_scalar_list_value(self, arg: List[_ScalarType]) -> int: + new_id = len(self.values) + if isinstance(arg[0], bool): + self.values.append( + vk_graph_schema.VkValue( + vk_graph_schema.BoolList(items=[cast(bool, e) for e in arg]) + ) + ) + elif isinstance(arg[0], int): + self.values.append( + vk_graph_schema.VkValue( + vk_graph_schema.IntList(items=[cast(int, e) for e in arg]) + ) + ) + elif isinstance(arg[0], float): + self.values.append( + vk_graph_schema.VkValue( + vk_graph_schema.DoubleList(items=[cast(float, e) for e in arg]) + ) ) + return new_id + + def create_string_value(self, string: str) -> int: + new_id = len(self.values) + self.values.append( + vk_graph_schema.VkValue(vk_graph_schema.String(string_val=string)) + ) + return new_id + + def get_or_create_value_for(self, arg: _Argument): + if isinstance(arg, Node): + # If the Node has already been processed, return the existing id. + if arg in self.node_to_value_ids: + return self.node_to_value_ids[arg] + return self.create_node_value(arg) + elif isinstance(arg, list) and isinstance(arg[0], Node): + # pyre-ignore[6] + return self.create_value_list_value(arg) + elif isinstance(arg, TensorSpec): + return self.create_tensor_value(arg) + elif isinstance(arg, _ScalarType): + return self.create_scalar_value(arg) + elif isinstance(arg, list) and isinstance(arg[0], _ScalarType): + # pyre-ignore[6] + return self.create_scalar_list_value(arg) + elif isinstance(arg, str): + return self.create_string_value(arg) + else: + raise RuntimeError(f"Cannot create value for arg of type {type(arg)}") def process_placeholder_node(self, node: Node) -> None: - ids = self.create_vk_values_for(node) + ids = self.create_node_value(node) if not self.is_param_node(node): if isinstance(ids, int): self.input_ids.append(ids) else: self.input_ids += ids + def process_getitem_node(self, node: Node) -> None: + # Find ValueList id from the collection node. + collection_node = node.all_input_nodes[0] + list_id = self.node_to_value_ids[collection_node] + + # Extract the target Value id from ValueList. + valuelist_id = node.args[1] + value_id = self.values[list_id].value.items[valuelist_id] + + # Map Node to Value id. + self.node_to_value_ids[node] = value_id + def process_call_function_node(self, node) -> None: - args = [] - # Add input nodes - for inp_node in node.all_input_nodes: - if inp_node not in self.node_to_value_ids: - raise AssertionError( - "Cannot find input to current node in node_to_value_ids. This means " - "this node is being serialized before its input which is not allowed." 
- ) - args.append(self.node_to_value_ids[inp_node]) + operator_call_args = [] + + for i, schema_arg in enumerate(node.target._schema.arguments): + if not schema_arg.kwarg_only and i < len(node.args): + function_arg = node.args[i] + elif schema_arg.name in node.kwargs: + function_arg = node.kwargs[schema_arg.name] + else: + function_arg = schema_arg.default_value + + # Create a Value for each function argument. If the argument has been + # previously encountered, then use the existing Value id. + operator_call_args.append(self.get_or_create_value_for(function_arg)) + # Add output node - args.append(self.create_vk_values_for(node)) + operator_call_args.append(self.create_node_value(node)) self.chain.append( vk_graph_schema.OperatorCall( name=node.target.__name__, - args=args, + args=operator_call_args, ), ) def process_getattr_node(self, node: Node) -> None: - self.create_vk_values_for(node) + self.create_node_value(node) def process_output_node(self, node: Node) -> None: - if node.all_input_nodes[0] not in self.node_to_value_ids: - raise AssertionError( - "Cannot find input to output node in node_to_value_ids. This means the " - "output node is being serialized before its corresponding internal node " - "which is not allowed." - ) - self.output_ids.append(self.node_to_value_ids[node.all_input_nodes[0]]) + for out_node in node.all_input_nodes: + if out_node not in self.node_to_value_ids: + raise AssertionError( + "Cannot find input to output node in node_to_value_ids. This means " + "the output node is being serialized before its corresponding " + "internal node which is not allowed." + ) + self.output_ids.append(self.node_to_value_ids[out_node]) def process_node(self, node: Node) -> None: if node.op == "placeholder": self.process_placeholder_node(node) elif node.op == "call_function": - self.process_call_function_node(node) + if node.target == operator.getitem: + self.process_getitem_node(node) + else: + self.process_call_function_node(node) elif node.op == "get_attr": self.process_getattr_node(node) elif node.op == "output": diff --git a/backends/vulkan/serialization/vulkan_graph_schema.py b/backends/vulkan/serialization/vulkan_graph_schema.py index eeb1589a2a..2edb02a910 100644 --- a/backends/vulkan/serialization/vulkan_graph_schema.py +++ b/backends/vulkan/serialization/vulkan_graph_schema.py @@ -12,7 +12,7 @@ from dataclasses import dataclass from enum import IntEnum -from typing import List +from typing import List, Union @dataclass @@ -22,7 +22,12 @@ class OperatorCall: class VkDataType(IntEnum): - fp32 = 0 + BOOL = 0 + UINT8 = 1 + INT8 = 2 + INT32 = 3 + FLOAT16 = 4 + FLOAT32 = 5 @dataclass @@ -34,13 +39,67 @@ class VkTensor: @dataclass -class VkScalar: +class Null: pass +@dataclass +class Int: + int_val: int + + +@dataclass +class Bool: + bool_val: bool + + +@dataclass +class Double: + double_val: float + + +@dataclass +class IntList: + items: List[int] + + +@dataclass +class DoubleList: + items: List[float] + + +@dataclass +class BoolList: + items: List[bool] + + +@dataclass +class ValueList: + items: List[int] + + +@dataclass +class String: + string_val: str + + +GraphTypes = Union[ + Null, + Int, + Double, + Bool, + VkTensor, + IntList, + BoolList, + DoubleList, + ValueList, + String, +] + + @dataclass class VkValue: - value: VkTensor + value: "GraphTypes" @dataclass diff --git a/backends/vulkan/serialization/vulkan_graph_serialize.py b/backends/vulkan/serialization/vulkan_graph_serialize.py index 83a9e75f6c..37785f4752 100644 --- 
a/backends/vulkan/serialization/vulkan_graph_serialize.py +++ b/backends/vulkan/serialization/vulkan_graph_serialize.py @@ -12,7 +12,6 @@ from dataclasses import dataclass from typing import ClassVar, List -# pyre-ignore[21]: Could not find module `executorch.exir._serialize._bindings`. import pkg_resources import torch @@ -35,7 +34,6 @@ def convert_to_flatbuffer(vk_graph: VkGraph) -> bytes: json_path = os.path.join(d, "schema.json") with open(json_path, "wb") as json_file: json_file.write(vk_graph_json.encode("ascii")) - # pyre-ignore _flatc_compile(d, schema_path, json_path) output_path = os.path.join(d, "schema.bin") with open(output_path, "rb") as output_file: diff --git a/backends/vulkan/targets.bzl b/backends/vulkan/targets.bzl index 345f18801f..1e7670d1cc 100644 --- a/backends/vulkan/targets.bzl +++ b/backends/vulkan/targets.bzl @@ -1,6 +1,55 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") -def define_common_targets(): +def vulkan_spv_shader_lib(name, spv_filegroups, is_fbcode = False): + gen_aten_vulkan_spv_target = "//caffe2/tools:gen_aten_vulkan_spv_bin" + glslc_path = "//caffe2/fb/vulkan/dotslash:glslc" + if is_fbcode: + gen_aten_vulkan_spv_target = "//caffe2:gen_vulkan_spv_bin" + glslc_path = "//caffe2/fb/vulkan/tools:glslc" + + glsl_paths = [] + + # TODO(ssjia): remove the need for subpath once subdir_glob is enabled in OSS + for target, subpath in spv_filegroups.items(): + glsl_paths.append("$(location {})/{}".format(target, subpath)) + + genrule_cmd = [ + "$(exe {})".format(gen_aten_vulkan_spv_target), + "--glsl-paths {}".format(" ".join(glsl_paths)), + "--output-path $OUT", + "--glslc-path=$(exe {})".format(glslc_path), + "--tmp-dir-path=$OUT", + ] + + genrule_name = "gen_{}_cpp".format(name) + runtime.genrule( + name = genrule_name, + outs = { + "{}.cpp".format(name): ["spv.cpp"], + }, + cmd = " ".join(genrule_cmd), + default_outs = ["."], + labels = ["uses_dotslash"], + ) + + runtime.cxx_library( + name = name, + srcs = [ + ":{}[{}.cpp]".format(genrule_name, name), + ], + define_static_target = False, + # Static initialization is used to register shaders to the global shader registry, + # therefore link_whole must be True to make sure unused symbols are not discarded. + # @lint-ignore BUCKLINT: Avoid `link_whole=True` + link_whole = True, + # Define a soname that can be used for dynamic loading in Java, Python, etc. + soname = "lib{}.$(ext)".format(name), + exported_deps = [ + "//caffe2:torch_vulkan_api", + ], + ) + +def define_common_targets(is_fbcode = False): runtime.genrule( name = "gen_vk_delegate_schema", srcs = [ @@ -38,6 +87,21 @@ def define_common_targets(): ], ) + runtime.filegroup( + name = "vulkan_graph_runtime_shaders", + srcs = native.glob([ + "runtime/graph/ops/glsl/*", + ]), + ) + + vulkan_spv_shader_lib( + name = "vulkan_graph_runtime_shaderlib", + spv_filegroups = { + ":vulkan_graph_runtime_shaders": "runtime/graph/ops/glsl", + }, + is_fbcode = is_fbcode, + ) + runtime.cxx_library( name = "vulkan_graph_runtime", srcs = native.glob([ @@ -53,10 +117,15 @@ def define_common_targets(): "@EXECUTORCH_CLIENTS", ], exported_deps = [ - "//caffe2:torch_vulkan_ops", - "//caffe2:torch_vulkan_spv", + ":vulkan_graph_runtime_shaderlib", ], define_static_target = False, + # Static initialization is used to register operators to the global operator registry, + # therefore link_whole must be True to make sure unused symbols are not discarded. 
+ # @lint-ignore BUCKLINT: Avoid `link_whole=True` + link_whole = True, + # Define an soname that can be used for dynamic loading in Java, Python, etc. + soname = "libvulkan_graph_runtime.$(ext)", ) runtime.cxx_library( @@ -77,6 +146,7 @@ def define_common_targets(): ":vk_delegate_schema", ":vulkan_graph_runtime", "//executorch/runtime/backend:interface", + "//executorch/runtime/core/exec_aten/util:tensor_util", ], define_static_target = False, # VulkanBackend.cpp needs to compile with executor as whole diff --git a/backends/vulkan/test/glsl/all_shaders.yaml b/backends/vulkan/test/glsl/all_shaders.yaml new file mode 100644 index 0000000000..e6b4ca2cca --- /dev/null +++ b/backends/vulkan/test/glsl/all_shaders.yaml @@ -0,0 +1,62 @@ +binary_op_nobroadcast__test: + parameter_names_with_default_values: + DTYPE: float + OPERATOR: X + Y + generate_variant_forall: + DTYPE: + - VALUE: "half" + SUFFIX: "half" + - VALUE: "float" + SUFFIX: "float" + shader_variants: + - NAME: binary_add_nobroadcast__test + OPERATOR: X + Y + - NAME: binary_sub_nobroadcast__test + OPERATOR: X - Y + - NAME: binary_mul_nobroadcast__test + OPERATOR: X * Y + - NAME: binary_div_nobroadcast__test + OPERATOR: X / Y + - NAME: binary_pow_nobroadcast__test + OPERATOR: pow(X, Y) + +fill_texture__test: + parameter_names_with_default_values: + DTYPE: float + NDIM: 3 + generate_variant_forall: + DTYPE: + - VALUE: "half" + SUFFIX: "half" + - VALUE: "float" + SUFFIX: "float" + shader_variants: + - NAME: fill_texture__test + +image_to_nchw__test: + parameter_names_with_default_values: + NDIM: 3 + DTYPE: float + PACKING: CHANNELS_PACKED + generate_variant_forall: + DTYPE: + - VALUE: "half" + SUFFIX: "half" + - VALUE: "float" + SUFFIX: "float" + shader_variants: + - NAME: image3d_to_nchw__test_C_packed + +nchw_to_image__test: + parameter_names_with_default_values: + NDIM: 3 + DTYPE: float + PACKING: CHANNELS_PACKED + generate_variant_forall: + DTYPE: + - VALUE: "half" + SUFFIX: "half" + - VALUE: "float" + SUFFIX: "float" + shader_variants: + - NAME: nchw_to_image3d__test_C_packed diff --git a/backends/vulkan/test/glsl/binary_op_nobroadcast__test.glsl b/backends/vulkan/test/glsl/binary_op_nobroadcast__test.glsl new file mode 100644 index 0000000000..f5e5d6b4e4 --- /dev/null +++ b/backends/vulkan/test/glsl/binary_op_nobroadcast__test.glsl @@ -0,0 +1,42 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core +// clang-format off +#define PRECISION ${PRECISION} + +#define OP(X, Y) ${OPERATOR} +// clang-format on + +layout(std430) buffer; + +// clang-format off +layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly image3D image_out; +// clang-format on +layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; +layout(set = 0, binding = 2) uniform PRECISION sampler3D image_other; + +layout(set = 0, binding = 3) uniform PRECISION restrict OutExtents { + uvec4 data; +} +out_extents; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (any(greaterThanEqual(pos, out_extents.data.xyz))) { + return; + } + + vec4 in_texel = texelFetch(image_in, pos, 0); + vec4 other_texel = texelFetch(image_other, pos, 0); + + imageStore(image_out, pos, OP(in_texel, other_texel)); +} diff --git a/backends/vulkan/test/glsl/fill_texture__test.glsl b/backends/vulkan/test/glsl/fill_texture__test.glsl new file mode 100644 index 0000000000..fafad11d49 --- /dev/null +++ b/backends/vulkan/test/glsl/fill_texture__test.glsl @@ -0,0 +1,35 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core +#define PRECISION ${PRECISION} + +layout(std430) buffer; + +/* Qualifiers: layout - storage - precision - memory */ + +// clang-format off +layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} uOutput; +// clang-format on +layout(set = 0, binding = 1) uniform PRECISION restrict Block { + ivec3 size; + int fill; + vec4 vals; +} params; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (any(greaterThanEqual(pos, params.size))) { + return; + } + + imageStore(uOutput, pos, params.vals); +} diff --git a/backends/vulkan/test/glsl/image_to_nchw__test.glsl b/backends/vulkan/test/glsl/image_to_nchw__test.glsl new file mode 100644 index 0000000000..b5563b080f --- /dev/null +++ b/backends/vulkan/test/glsl/image_to_nchw__test.glsl @@ -0,0 +1,62 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core +// clang-format off +#define PRECISION ${PRECISION} +// clang-format on + +#include "indexing_utils.h" + +layout(std430) buffer; + +layout(set = 0, binding = 0) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} image_in; +layout(set = 0, binding = 1) buffer PRECISION restrict writeonly Buffer { + ${T[DTYPE]} data[]; +} +buffer_out; + +layout(set = 0, binding = 2) uniform PRECISION restrict GpuSizes { + ivec4 data; +} +gpu_sizes; + +layout(set = 0, binding = 3) uniform PRECISION restrict CpuSizes { + ivec4 data; +} +cpu_sizes; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + const ivec4 coord = POS_TO_COORD_${PACKING}(pos, gpu_sizes.data); + + if (any(greaterThanEqual(coord, gpu_sizes.data))) { + return; + } + + const ${VEC4_T[DTYPE]} intex = texelFetch(image_in, pos, 0); + + const int base_index = COORD_TO_BUFFER_IDX(coord, cpu_sizes.data); + const ivec4 buf_indices = + base_index + ivec4(0, 1, 2, 3) * (gpu_sizes.data.x * gpu_sizes.data.y); + + if (coord.z < cpu_sizes.data.z) { + buffer_out.data[buf_indices.x] = intex.x; + } + if (coord.z + 1 < cpu_sizes.data.z) { + buffer_out.data[buf_indices.y] = intex.y; + } + if (coord.z + 2 < cpu_sizes.data.z) { + buffer_out.data[buf_indices.z] = intex.z; + } + if (coord.z + 3 < cpu_sizes.data.z) { + buffer_out.data[buf_indices.w] = intex.w; + } +} diff --git a/backends/vulkan/test/glsl/indexing_utils.h b/backends/vulkan/test/glsl/indexing_utils.h new file mode 100644 index 0000000000..d3f005c1ee --- /dev/null +++ b/backends/vulkan/test/glsl/indexing_utils.h @@ -0,0 +1,14 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#define POS_TO_COORD_CHANNELS_PACKED(pos, sizes) \ + ivec4(pos.x, pos.y, (pos.z * 4) % sizes.z, (pos.z * 4) / sizes.z) + +#define COORD_TO_BUFFER_IDX(coord, sizes) \ + coord.x + coord.y* sizes.x + coord.z* sizes.y* sizes.x + \ + coord.w* sizes.z* sizes.y* sizes.x; diff --git a/backends/vulkan/test/glsl/nchw_to_image__test.glsl b/backends/vulkan/test/glsl/nchw_to_image__test.glsl new file mode 100644 index 0000000000..1a41fd88d0 --- /dev/null +++ b/backends/vulkan/test/glsl/nchw_to_image__test.glsl @@ -0,0 +1,64 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core +// clang-format off +#define PRECISION ${PRECISION} +// clang-format on + +#include "indexing_utils.h" + +layout(std430) buffer; + +// clang-format off +layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; +// clang-format on +layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer { + ${T[DTYPE]} data[]; +} +buffer_in; + +layout(set = 0, binding = 2) uniform PRECISION restrict GpuSizes { + ivec4 data; +} +gpu_sizes; + +layout(set = 0, binding = 3) uniform PRECISION restrict CpuSizes { + ivec4 data; +} +cpu_sizes; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + const ivec4 coord = POS_TO_COORD_${PACKING}(pos, gpu_sizes.data); + + if (any(greaterThanEqual(coord, gpu_sizes.data))) { + return; + } + + const int base_index = COORD_TO_BUFFER_IDX(coord, cpu_sizes.data); + const ivec4 buf_indices = + base_index + ivec4(0, 1, 2, 3) * (gpu_sizes.data.x * gpu_sizes.data.y); + + ${T[DTYPE]} val_x = buffer_in.data[buf_indices.x]; + ${T[DTYPE]} val_y = buffer_in.data[buf_indices.y]; + ${T[DTYPE]} val_z = buffer_in.data[buf_indices.z]; + ${T[DTYPE]} val_w = buffer_in.data[buf_indices.w]; + + ${VEC4_T[DTYPE]} texel = ${VEC4_T[DTYPE]}(val_x, val_y, val_z, val_w); + + if (coord.z + 3 >= cpu_sizes.data.z) { + ivec4 c_ind = ivec4(coord.z) + ivec4(0, 1, 2, 3); + vec4 valid_c = vec4(lessThan(c_ind, ivec4(cpu_sizes.data.z))); + texel = texel * valid_c; + } + + imageStore(image_out, ${GET_POS[NDIM]("pos")}, texel); +} diff --git a/backends/vulkan/test/glsl/test_shader.glsl b/backends/vulkan/test/glsl/test_shader.glsl new file mode 100644 index 0000000000..39edc92cc6 --- /dev/null +++ b/backends/vulkan/test/glsl/test_shader.glsl @@ -0,0 +1,35 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core +#define PRECISION ${PRECISION} +#define FORMAT ${FORMAT} + +layout(std430) buffer; + +/* Qualifiers: layout - storage - precision - memory */ + +layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; +layout(set = 0, binding = 2) uniform PRECISION restrict Block { + ivec4 size; +} uBlock; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (all(lessThan(pos, uBlock.size.xyz))) { + const vec4 intex = texelFetch(uInput, pos, 0); + imageStore( + uOutput, + pos, + intex + 5); + } +} diff --git a/backends/vulkan/test/test_vulkan_delegate.py b/backends/vulkan/test/test_vulkan_delegate.py index 8a491497c3..d18ca74588 100644 --- a/backends/vulkan/test/test_vulkan_delegate.py +++ b/backends/vulkan/test/test_vulkan_delegate.py @@ -14,7 +14,7 @@ from executorch.backends.vulkan.vulkan_preprocess import VulkanBackend from executorch.exir import EdgeProgramManager, to_edge -from torch.export import export, ExportedProgram +from torch.export import Dim, export, ExportedProgram ctypes.CDLL("libvulkan.so.1") @@ -54,13 +54,17 @@ def lower_module_and_test_output( sample_inputs: Tuple[torch.Tensor], atol=1e-03, rtol=1e-01, + dynamic_shapes=None, + test_inputs=None, ): """ Helper testing function that takes a torch.nn.Module and lowers it to Vulkan with the given sample inputs. It then runs the lowered module and compares its outputs with the outputs of the eager module. """ - program: ExportedProgram = export(model, sample_inputs) + program: ExportedProgram = export( + model, sample_inputs, dynamic_shapes=dynamic_shapes + ) edge_program: EdgeProgramManager = to_edge(program) edge_program = edge_program.to_backend(VulkanPartitioner()) @@ -80,6 +84,19 @@ def lower_module_and_test_output( self.assert_outputs_equal(model_output, ref_output, atol=atol, rtol=rtol) + if test_inputs is not None: + for test_input in test_inputs: + # pyre-fixme[16]: Module `pytree` has no attribute `tree_flatten`. 
+ test_inputs_flattened, _ = tree_flatten(test_input) + model_output = executorch_module.run_method( + "forward", tuple(test_inputs_flattened) + ) + ref_output = model(*test_input) + + self.assert_outputs_equal( + model_output, ref_output, atol=atol, rtol=rtol + ) + def test_vulkan_backend_add(self): # This test is the simplest test by manually lowering some submodules, we can use paritioner for auto detecting lowerable parts class AddModule(torch.nn.Module): @@ -93,12 +110,12 @@ def forward(self, x, y): return z add_module = AddModule() - model_inputs = ( + sample_inputs = ( torch.rand(size=(2, 3), dtype=torch.float32), torch.rand(size=(2, 3), dtype=torch.float32), ) - self.lower_module_and_test_output(add_module, model_inputs) + self.lower_module_and_test_output(add_module, sample_inputs) def test_vulkan_backend_internal_data(self): class InternalDataModule(torch.nn.Module): @@ -107,19 +124,19 @@ def __init__(self): self.weight = torch.rand(size=(2, 3), dtype=torch.float32) def forward(self, x, y): - z = x + y - z = z + x + z = torch.add(x, y, alpha=2) + z = torch.add(x, y, alpha=3.14) z = z + x z = z + self.weight return z internal_data_module = InternalDataModule() - model_inputs = ( + sample_inputs = ( torch.rand(size=(2, 3), dtype=torch.float32), torch.rand(size=(2, 3), dtype=torch.float32), ) - self.lower_module_and_test_output(internal_data_module, model_inputs) + self.lower_module_and_test_output(internal_data_module, sample_inputs) def test_vulkan_backend_sub(self): class SubModule(torch.nn.Module): @@ -127,18 +144,18 @@ def __init__(self): super().__init__() def forward(self, x, y): - z = x - y - z = z - x + z = torch.sub(x, y, alpha=2) + z = torch.sub(z, x, alpha=3.14) z = z - x return z sub_module = SubModule() - model_inputs = ( + sample_inputs = ( torch.rand(size=(2, 3), dtype=torch.float32), torch.rand(size=(2, 3), dtype=torch.float32), ) - self.lower_module_and_test_output(sub_module, model_inputs) + self.lower_module_and_test_output(sub_module, sample_inputs) def test_vulkan_backend_mul(self): class MulModule(torch.nn.Module): @@ -152,12 +169,12 @@ def forward(self, x, y): return z mul_module = MulModule() - model_inputs = ( + sample_inputs = ( torch.rand(size=(2, 3), dtype=torch.float32), torch.rand(size=(2, 3), dtype=torch.float32), ) - self.lower_module_and_test_output(mul_module, model_inputs) + self.lower_module_and_test_output(mul_module, sample_inputs) def test_vulkan_backend_div(self): class DivModule(torch.nn.Module): @@ -171,12 +188,12 @@ def forward(self, x, y): return z div_module = DivModule() - model_inputs = ( + sample_inputs = ( torch.rand(size=(2, 3), dtype=torch.float32), torch.rand(size=(2, 3), dtype=torch.float32), ) - self.lower_module_and_test_output(div_module, model_inputs) + self.lower_module_and_test_output(div_module, sample_inputs) def test_vulkan_backend_arithmetic(self): class ArithmeticModule(torch.nn.Module): @@ -192,12 +209,12 @@ def forward(self, x, y): return z arithmetic_module = ArithmeticModule() - model_inputs = ( + sample_inputs = ( torch.rand(size=(2, 3), dtype=torch.float32), torch.rand(size=(2, 3), dtype=torch.float32), ) - self.lower_module_and_test_output(arithmetic_module, model_inputs) + self.lower_module_and_test_output(arithmetic_module, sample_inputs) def test_vulkan_backend_floor_div(self): class FloorDivModule(torch.nn.Module): @@ -209,14 +226,14 @@ def forward(self, x, y): return z floor_div_module = FloorDivModule() - model_inputs = ( + sample_inputs = ( torch.rand(size=(2, 3), dtype=torch.float32) * 10.0, 
torch.rand(size=(2, 3), dtype=torch.float32) + 1.0, ) # absolute tolerance is 1 because of flooring self.lower_module_and_test_output( - floor_div_module, model_inputs, atol=1.0 + 1e-03 + floor_div_module, sample_inputs, atol=1.0 + 1e-03 ) def test_vulkan_backend_pow(self): @@ -229,12 +246,12 @@ def forward(self, x, y): return z pow_module = PowModule() - model_inputs = ( + sample_inputs = ( torch.rand(size=(2, 3), dtype=torch.float32), torch.rand(size=(2, 3), dtype=torch.float32), ) - self.lower_module_and_test_output(pow_module, model_inputs) + self.lower_module_and_test_output(pow_module, sample_inputs) def test_vulkan_backend_partial(self): class SimpleModel(torch.nn.Module): @@ -248,6 +265,41 @@ def forward(self, x): return self.linear(x + self.offset_1) - self.offset_2 model = SimpleModel() - model_inputs = (torch.rand(size=(2, 10), dtype=torch.float32),) + sample_inputs = (torch.rand(size=(2, 10), dtype=torch.float32),) + + self.lower_module_and_test_output(model, sample_inputs) + + def test_vulkan_backend_partial_dynamic_shapes(self): + class SimpleModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.branch1 = torch.nn.Sequential( + torch.nn.Linear(64, 64), torch.nn.ReLU() + ) + self.branch2 = torch.nn.Sequential( + torch.nn.Linear(128, 64), torch.nn.ReLU() + ) + self.buffer_1 = torch.ones((1, 64)) * 0.5 + self.buffer_2 = torch.ones((1, 64)) * 1.4 + + def forward(self, x1, x2): + out1 = self.branch1(x1) + out2 = self.branch2(x2) + return (out1 + self.buffer_1 + out2) * self.buffer_2 + + model = SimpleModel() + sample_inputs = (torch.randn(32, 64), torch.randn(32, 128)) + batch = Dim("batch", max=32) + dynamic_shapes = {"x1": {0: batch}, "x2": {0: batch}} + + test_inputs = [ + (torch.randn(15, 64), torch.randn(15, 128)), + (torch.randn(6, 64), torch.randn(6, 128)), + (torch.randn(30, 64), torch.randn(30, 128)), + (torch.randn(20, 64), torch.randn(20, 128)), + (torch.randn(19, 64), torch.randn(19, 128)), + ] - self.lower_module_and_test_output(model, model_inputs) + self.lower_module_and_test_output( + model, sample_inputs, dynamic_shapes=dynamic_shapes, test_inputs=test_inputs + ) diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp new file mode 100644 index 0000000000..45b91ce99b --- /dev/null +++ b/backends/vulkan/test/utils/test_utils.cpp @@ -0,0 +1,233 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include + +// +// Operator Recording Functions +// + +void record_nchw_to_buffer_op( + api::Context* const context, + api::VulkanBuffer& src_buffer, + vTensor& v_dst) { + uint32_t buf_len = api::utils::safe_downcast(v_dst.gpu_numel()); + api::utils::uvec3 global_size = {buf_len, 1u, 1u}; + api::utils::uvec3 local_size = {32u, 1u, 1u}; + + api::UniformParamsBuffer cpu_buffer_metadata( + context, v_dst.get_cpu_buffer_metadata()); + api::PipelineBarrier pipeline_barrier{}; + + context->submit_compute_job( + VK_KERNEL(buffer_to_buffer), + pipeline_barrier, + global_size, + local_size, + VK_NULL_HANDLE, + v_dst.buffer( + pipeline_barrier, + api::PipelineStage::COMPUTE, + api::MemoryAccessType::WRITE), + v_dst.buffer_metadata(), + src_buffer, + cpu_buffer_metadata.buffer()); +} + +bool record_buffer_to_nchw_op( + api::Context* const context, + vTensor& v_src, + api::VulkanBuffer& dst_buffer) { + uint32_t buf_len = api::utils::safe_downcast(v_src.numel()); + api::utils::uvec3 global_size = {buf_len, 1u, 1u}; + api::utils::uvec3 local_size = {4u, 1u, 1u}; + + api::UniformParamsBuffer cpu_buffer_metadata( + context, v_src.get_cpu_buffer_metadata()); + api::PipelineBarrier pipeline_barrier{}; + + return context->submit_compute_job( + VK_KERNEL(buffer_to_buffer), + pipeline_barrier, + global_size, + local_size, + VK_NULL_HANDLE, + dst_buffer, + cpu_buffer_metadata.buffer(), + v_src.buffer( + pipeline_barrier, + api::PipelineStage::COMPUTE, + api::MemoryAccessType::WRITE), + v_src.buffer_metadata()); +} + +void record_nchw_to_image_op( + api::Context* const context, + api::VulkanBuffer& src_buffer, + vTensor& v_dst) { + api::PipelineBarrier pipeline_barrier{}; + api::ShaderInfo compute_shader = + VK_KERNEL(nchw_to_image3d__test_C_packed_half); + if (v_dst.image().format() == VK_FORMAT_R32G32B32A32_SFLOAT) { + compute_shader = VK_KERNEL(nchw_to_image3d__test_C_packed_float); + } + context->submit_compute_job( + compute_shader, + pipeline_barrier, + v_dst.virtual_extents(), + adaptive_work_group_size(v_dst.virtual_extents()), + VK_NULL_HANDLE, + v_dst.image( + pipeline_barrier, + api::PipelineStage::COMPUTE, + api::MemoryAccessType::WRITE), + src_buffer, + v_dst.gpu_sizes_ubo()->buffer(), + v_dst.cpu_sizes_ubo()->buffer()); +} + +void record_image_to_nchw_op( + api::Context* const context, + vTensor& v_src, + api::VulkanBuffer& dst_buffer) { + api::ShaderInfo compute_shader = + VK_KERNEL(image3d_to_nchw__test_C_packed_half); + if (v_src.image().format() == VK_FORMAT_R32G32B32A32_SFLOAT) { + compute_shader = VK_KERNEL(image3d_to_nchw__test_C_packed_float); + } + api::PipelineBarrier pipeline_barrier{}; + context->submit_compute_job( + compute_shader, + pipeline_barrier, + v_src.virtual_extents(), + adaptive_work_group_size(v_src.virtual_extents()), + VK_NULL_HANDLE, + v_src.image(pipeline_barrier, api::PipelineStage::COMPUTE), + dst_buffer, + v_src.gpu_sizes_ubo()->buffer(), + v_src.cpu_sizes_ubo()->buffer()); +} + +void record_binary_op( + api::Context* const context, + const std::string& op_name, + vTensor& v_in1, + vTensor& v_in2, + vTensor& v_dst) { + std::stringstream kernel_name; + kernel_name << "binary_" << op_name << "_nobroadcast__test"; + apply_dtype_suffix(kernel_name, v_dst); + + api::PipelineBarrier pipeline_barrier{}; + context->submit_compute_job( + VK_KERNEL_FROM_STR(kernel_name.str()), + pipeline_barrier, + v_dst.virtual_extents(), + adaptive_work_group_size(v_dst.virtual_extents()), + VK_NULL_HANDLE, + v_dst.image( + pipeline_barrier, + api::PipelineStage::COMPUTE, 
+ api::MemoryAccessType::WRITE), + v_in1.image(pipeline_barrier, api::PipelineStage::COMPUTE), + v_in2.image(pipeline_barrier, api::PipelineStage::COMPUTE), + v_dst.extents_ubo()->buffer()); +} + +void execute_and_check_add( + vTensor& a, + vTensor& b, + vTensor& c, + float a_val, + float b_val) { + // Add shader kernel + api::ShaderInfo kernel = VK_KERNEL(binary_add_nobroadcast__test_half); + if (c.image().format() == VK_FORMAT_R32G32B32A32_SFLOAT) { + kernel = VK_KERNEL(nchw_to_image3d__test_C_packed_float); + } + + // Fill input tensors + fill_vtensor(a, a_val); + fill_vtensor(b, b_val); + + // a + b = c + record_binary_op(api::context(), "add", a, b, c); + + // Extract output tensor + std::vector data_out = extract_vtensor(c); + + // Check output + for (const auto& d : data_out) { + EXPECT_TRUE(d == (a_val + b_val)); + } +} + +// +// Input & Output Utilities +// + +void fill_vtensor(vTensor& vten, std::vector& data) { + api::StorageBuffer staging_buffer(api::context(), api::kFloat, data.size()); + + copy_ptr_to_staging(data.data(), staging_buffer, vten.gpu_nbytes()); + + if (vten.storage_type() == api::StorageType::BUFFER) { + record_nchw_to_buffer_op(api::context(), staging_buffer.buffer(), vten); + } else { + record_nchw_to_image_op(api::context(), staging_buffer.buffer(), vten); + } +} + +void fill_vtensor(ComputeGraph& graph, const IOValueRef idx, float val) { + std::vector data(graph.get_val(idx.value).toTensor().gpu_numel()); + std::fill(data.begin(), data.end(), val); + + graph.copy_into_staging(idx.staging, data.data(), data.size()); +} + +void extract_vtensor(vTensor& vten, std::vector& data) { + api::StorageBuffer staging_buffer( + api::context(), api::kFloat, vten.gpu_numel()); + + if (vten.storage_type() == api::StorageType::BUFFER) { + record_buffer_to_nchw_op(api::context(), vten, staging_buffer.buffer()); + } else { + record_image_to_nchw_op(api::context(), vten, staging_buffer.buffer()); + } + + api::VulkanFence fence = api::context()->fences().get_fence(); + api::context()->submit_cmd_to_gpu(fence.get_submit_handle()); + fence.wait(); + + copy_staging_to_ptr(staging_buffer, data.data(), vten.gpu_nbytes()); +} + +// +// Context Management +// + +void submit_to_gpu() { + api::VulkanFence fence = api::context()->fences().get_fence(); + api::context()->submit_cmd_to_gpu(fence.get_submit_handle()); + fence.wait(); +} + +api::MemoryAllocation allocate_memory_for(const vTensor& vten) { + return api::context()->adapter_ptr()->vma().create_allocation( + vten.get_memory_requirements(), vten.get_allocation_create_info()); +} + +VmaTotalStatistics get_vma_stats() { + return api::context()->adapter_ptr()->vma().get_memory_statistics(); +} + +size_t get_vma_allocation_count() { + return get_vma_stats().total.statistics.allocationCount; +} diff --git a/backends/vulkan/test/utils/test_utils.h b/backends/vulkan/test/utils/test_utils.h new file mode 100644 index 0000000000..8c946107ef --- /dev/null +++ b/backends/vulkan/test/utils/test_utils.h @@ -0,0 +1,144 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include + +#include + +#include +#include + +using namespace at::native::vulkan; + +#define CREATE_FLOAT_TEXTURE(sizes, allocate_memory) \ + vTensor( \ + api::context(), \ + sizes, \ + api::kFloat, \ + api::StorageType::TEXTURE_3D, \ + api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, \ + allocate_memory); + +#define CREATE_FLOAT_BUFFER(sizes, allocate_memory) \ + vTensor( \ + api::context(), \ + sizes, \ + api::kFloat, \ + api::StorageType::BUFFER, \ + api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, \ + allocate_memory); + +#define DEFINE_STAGING_BUFFER_AND_RECORD_TO_GPU_FOR(tensor) \ + api::StorageBuffer staging_buffer_##tensor( \ + api::context(), api::kFloat, tensor.gpu_numel()); \ + record_nchw_to_image_op( \ + api::context(), staging_buffer_##tensor.buffer(), tensor); + +#define DEFINE_STAGING_BUFFER_AND_RECORD_FROM_GPU_FOR(tensor) \ + api::StorageBuffer staging_buffer_##tensor( \ + api::context(), api::kFloat, tensor.gpu_numel()); \ + record_image_to_nchw_op( \ + api::context(), tensor, staging_buffer_##tensor.buffer()); + +// +// Operator Recording +// + +void record_nchw_to_buffer_op( + api::Context* const context, + api::VulkanBuffer& src_buffer, + vTensor& v_dst); + +bool record_buffer_to_nchw_op( + api::Context* const context, + vTensor& v_src, + api::VulkanBuffer& dst_buffer); + +void record_nchw_to_image_op( + api::Context* const context, + api::VulkanBuffer& src_buffer, + vTensor& v_dst); + +void record_image_to_nchw_op( + api::Context* const context, + vTensor& v_src, + api::VulkanBuffer& dst_buffer); + +void record_binary_op( + api::Context* const context, + const std::string& op_name, + vTensor& v_in1, + vTensor& v_in2, + vTensor& v_dst); + +void execute_and_check_add( + vTensor& a, + vTensor& b, + vTensor& c, + float a_val, + float b_val); + +// +// Input & Output Utilities +// + +inline void +fill_staging(api::StorageBuffer& staging, float val, int numel = -1) { + if (numel < 0) { + numel = staging.numel(); + } + std::vector data(numel); + std::fill(data.begin(), data.end(), val); + copy_ptr_to_staging(data.data(), staging, sizeof(float) * numel); +} + +void fill_vtensor(vTensor& vten, std::vector& data); + +inline void fill_vtensor(vTensor& vten, float val) { + std::vector vten_data(vten.gpu_numel()); + std::fill(vten_data.begin(), vten_data.end(), val); + + fill_vtensor(vten, vten_data); +} + +void fill_vtensor(ComputeGraph& graph, const IOValueRef idx, float val); + +void extract_vtensor(vTensor& vten, std::vector& data); + +inline std::vector extract_vtensor(vTensor& vten) { + std::vector data_out(vten.gpu_numel()); + extract_vtensor(vten, data_out); + return data_out; +} + +inline void +check_staging_buffer(api::StorageBuffer& staging, float val, int numel = -1) { + if (numel < 0) { + numel = staging.numel(); + } + std::vector data(numel); + copy_staging_to_ptr(staging, data.data(), sizeof(float) * numel); + + for (const auto& d : data) { + EXPECT_TRUE(d == val); + } +} + +// +// Context Management +// + +void submit_to_gpu(); + +api::MemoryAllocation allocate_memory_for(const vTensor& vten); + +VmaTotalStatistics get_vma_stats(); + +size_t get_vma_allocation_count(); diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 0692d8c709..eaaca78749 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -10,136 +10,18 @@ #include -#include -#include +#include -#include -#include +#include -using namespace at::native::vulkan; 
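A sketch for orientation, not part of the patch: the helpers deleted from this test file below now live in backends/vulkan/test/utils/test_utils.{h,cpp}, added above. A test written against the relocated utilities composes them roughly as follows; the <int64_t> and <float> element types are inferred from context, since template arguments were stripped in this copy of the diff.

std::vector<int64_t> sizes = {4, 4, 1};
vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true);
vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true);
vTensor c = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true);

// Stage inputs and record the add shader; record_binary_op builds the kernel
// name as "binary_add_nobroadcast__test" plus a dtype suffix.
fill_vtensor(a, 2.0f);
fill_vtensor(b, 3.0f);
record_binary_op(api::context(), "add", a, b, c);

// extract_vtensor() submits the recorded work, waits on a fence, and reads
// the result back through a staging buffer.
std::vector<float> out = extract_vtensor(c);
for (const auto& v : out) {
  EXPECT_TRUE(v == 5.0f);
}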
- -// -// Utilities -// - -#define CREATE_FLOAT_TEXTURE(sizes, allocate_memory) \ - vTensor( \ - api::context(), \ - sizes, \ - api::kFloat, \ - api::StorageType::TEXTURE_3D, \ - api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, \ - allocate_memory); - -#define CREATE_FLOAT_BUFFER(sizes, allocate_memory) \ - vTensor( \ - api::context(), \ - sizes, \ - api::kFloat, \ - api::StorageType::BUFFER, \ - api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, \ - allocate_memory); - -void fill_vtensor(vTensor& vten, std::vector& data) { - api::StorageBuffer staging_buffer(api::context(), api::kFloat, data.size()); - - copy_ptr_to_staging(data.data(), staging_buffer, vten.gpu_nbytes()); - - if (vten.storage_type() == api::StorageType::BUFFER) { - packing::record_nchw_to_buffer_op( - api::context(), staging_buffer.buffer(), vten, {}, VK_NULL_HANDLE); - } else { - api::ShaderInfo compute_shader = packing::get_nchw_to_image_shader(vten); - packing::record_nchw_to_image_op( - api::context(), - compute_shader, - staging_buffer.buffer(), - vten, - {}, - VK_NULL_HANDLE); - } -} - -void fill_vtensor(ComputeGraph& graph, const IOValueRef idx, float val) { - std::vector data(graph.get_val(idx.value).toTensor().gpu_numel()); - std::fill(data.begin(), data.end(), val); - - graph.copy_into_staging(idx.staging, data.data(), data.size()); -} - -void extract_vtensor(vTensor& vten, std::vector& data) { - api::StorageBuffer staging_buffer( - api::context(), api::kFloat, vten.gpu_numel()); - - if (vten.storage_type() == api::StorageType::BUFFER) { - packing::record_buffer_to_nchw_op( - api::context(), vten, staging_buffer.buffer(), {}, VK_NULL_HANDLE); - } else { - api::ShaderInfo compute_shader = packing::get_image_to_nchw_shader(vten); - packing::record_image_to_nchw_op( - api::context(), - compute_shader, - vten, - staging_buffer.buffer(), - {}, - VK_NULL_HANDLE); - } - - api::VulkanFence fence = api::context()->fences().get_fence(); - api::context()->submit_cmd_to_gpu(fence.get_submit_handle()); - fence.wait(); - - copy_staging_to_ptr(staging_buffer, data.data(), vten.gpu_nbytes()); -} - -api::MemoryAllocation allocate_memory_for(const vTensor& vten) { - return api::context()->adapter_ptr()->vma().create_allocation( - vten.get_memory_requirements(), vten.get_allocation_create_info()); -} - -VmaTotalStatistics get_vma_stats() { - return api::context()->adapter_ptr()->vma().get_memory_statistics(); -} - -size_t get_vma_allocation_count() { - return get_vma_stats().total.statistics.allocationCount; -} - -GraphConfig generate_graph_config() { - const uint32_t submit_frequency = UINT32_MAX; - - const api::CommandPoolConfig cmd_config{ - 4u, // cmdPoolInitialSize - 2u, // cmdPoolBatchSize - }; - - const api::DescriptorPoolConfig descriptor_pool_config{ - 1024u, // descriptorPoolMaxSets - 1024u, // descriptorUniformBufferCount - 1024u, // descriptorStorageBufferCount - 1024u, // descriptorCombinedSamplerCount - 1024u, // descriptorStorageImageCount - 32u, // descriptorPileSizes - }; +#include - const api::QueryPoolConfig query_pool_config{}; +#include - const api::ContextConfig context_config{ - submit_frequency, // cmdSubmitFrequency - cmd_config, // cmdPoolConfig - descriptor_pool_config, // descriptorPoolConfig - query_pool_config, // queryPoolConfig - }; - - const GraphConfig graph_config{ - context_config, - }; - - return graph_config; -} +using namespace at::native::vulkan; // -// Test Wrapper +// Compute API Tests // class VulkanComputeAPITest : public ::testing::Test { @@ -157,63 +39,67 @@ class VulkanComputeAPITest : public 
::testing::Test { } }; -// -// Compute API Tests -// +TEST_F(VulkanComputeAPITest, retrieve_custom_shader_test) { + // Try to get shader from custom shader library + const api::ShaderInfo& kernel = VK_KERNEL(test_shader); -TEST_F(VulkanComputeAPITest, buffer_copy_sanity_check) { - // Simple test that copies data into a and reads from a - std::vector sizes = {4, 4, 1}; - vTensor a = CREATE_FLOAT_BUFFER(sizes, /*allocate_memory = */ true); - - // Input data - std::vector data_in(a.gpu_numel()); - std::fill(data_in.begin(), data_in.end(), 2.524f); + EXPECT_TRUE(kernel.kernel_name == "test_shader"); +} - // Fill input tensor - fill_vtensor(a, data_in); +TEST_F(VulkanComputeAPITest, update_params_between_submit) { + api::context()->set_cmd(/*reusable = */ true); + std::vector sizes = {4, 4, 2}; + vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - // Read back data - std::vector data_out(a.gpu_numel()); - extract_vtensor(a, data_out); + std::stringstream kernel_name; + kernel_name << "fill_texture__test"; + apply_dtype_suffix(kernel_name, a); - // Check output - for (const auto& d : data_out) { - EXPECT_TRUE(d == 2.524f); - } -} + struct Params final { + api::utils::ivec3 size; + int32_t fill; + api::utils::vec4 values; + }; -TEST_F(VulkanComputeAPITest, buffer_deferred_allocation_test) { - // Same as buffer_copy_sanity_check, but defers memory allocation + Params block{ + {2, 4, 1}, + 0, + {5.0, 5.0, 5.0, 5.0}, + }; - std::vector sizes = {4, 4, 1}; - vTensor a = CREATE_FLOAT_BUFFER(sizes, /*allocate_memory = */ false); + api::UniformParamsBuffer params(api::context(), block); - // For buffer storage, a small uniform buffer is allocated containing size and - // stride data, which is why the check is for 1 allocation below. - EXPECT_TRUE(get_vma_allocation_count() == 1); + { + api::PipelineBarrier pipeline_barrier{}; + api::context()->submit_compute_job( + VK_KERNEL_FROM_STR(kernel_name.str()), + pipeline_barrier, + {4, 4, 4}, + {4, 4, 4}, + VK_NULL_HANDLE, + a.image( + pipeline_barrier, + api::PipelineStage::COMPUTE, + api::MemoryAccessType::WRITE), + params.buffer()); + } - // Input data - std::vector data_in(a.gpu_numel()); - std::fill(data_in.begin(), data_in.end(), 1.234f); + api::StorageBuffer staging_buffer(api::context(), api::kFloat, a.gpu_numel()); + record_image_to_nchw_op(api::context(), a, staging_buffer.buffer()); - // Allocate memory at the last possible opportunity - api::MemoryAllocation a_mem = allocate_memory_for(a); - a.buffer().bind_allocation(a_mem); + submit_to_gpu(); + check_staging_buffer(staging_buffer, 5.0f); - EXPECT_TRUE(get_vma_allocation_count() == 2); - - // Fill input tensor - fill_vtensor(a, data_in); + Params new_block{ + {2, 4, 1}, + 0, + {4.0, 4.0, 4.0, 4.0}, + }; - // Read back data - std::vector data_out(a.gpu_numel()); - extract_vtensor(a, data_out); + params.update(new_block); - // Check output - for (const auto& d : data_out) { - EXPECT_TRUE(d == 1.234f); - } + submit_to_gpu(); + check_staging_buffer(staging_buffer, 4.0f); } TEST_F(VulkanComputeAPITest, texture_add_sanity_check) { @@ -224,25 +110,15 @@ TEST_F(VulkanComputeAPITest, texture_add_sanity_check) { vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); vTensor c = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - // Input data - std::vector data_a(a.gpu_numel()); - std::fill(data_a.begin(), data_a.end(), 2.5f); - std::vector data_b(b.gpu_numel()); - std::fill(data_b.begin(), data_b.end(), 1.5f); - - // Add shader kernel - api::ShaderInfo kernel = 
arithmetic::get_shader(arithmetic::OpType::ADD); - // Fill input tensors - fill_vtensor(a, data_a); - fill_vtensor(b, data_b); + fill_vtensor(a, 2.5f); + fill_vtensor(b, 1.5f); // a + b -> c - arithmetic::record_op(api::context(), kernel, a, b, c, 1.0f); + record_binary_op(api::context(), "add", a, b, c); // Extract output tensor - std::vector data_out(c.gpu_numel()); - extract_vtensor(c, data_out); + std::vector data_out = extract_vtensor(c); // Check output for (const auto& d : data_out) { @@ -267,8 +143,6 @@ TEST_F(VulkanComputeAPITest, texture_deferred_allocation_test) { std::vector data_b(b.gpu_numel()); std::fill(data_b.begin(), data_b.end(), 1.5f); - api::ShaderInfo kernel = arithmetic::get_shader(arithmetic::OpType::ADD); - // Allocate memory at the last possible opportunity api::MemoryAllocation a_mem = allocate_memory_for(a); a.image().bind_allocation(a_mem); @@ -283,7 +157,7 @@ TEST_F(VulkanComputeAPITest, texture_deferred_allocation_test) { fill_vtensor(a, data_a); fill_vtensor(b, data_b); - arithmetic::record_op(api::context(), kernel, a, b, c, 1.0f); + record_binary_op(api::context(), "add", a, b, c); std::vector data_c(c.gpu_numel()); extract_vtensor(c, data_c); @@ -332,21 +206,18 @@ TEST_F(VulkanComputeAPITest, texture_resource_aliasing_test) { std::vector data_d(b.gpu_numel()); std::fill(data_d.begin(), data_d.end(), 1.0f); - // Get shader kernel for add - api::ShaderInfo kernel = arithmetic::get_shader(arithmetic::OpType::ADD); - // First, fill a and b with data fill_vtensor(a, data_a); fill_vtensor(b, data_b); // a + b -> c - arithmetic::record_op(api::context(), kernel, a, b, c, 1.0f); + record_binary_op(api::context(), "add", a, b, c); // Now d can be filled with data fill_vtensor(d, data_d); // c + d -> e - arithmetic::record_op(api::context(), kernel, c, d, e, 1.0f); + record_binary_op(api::context(), "add", c, d, e); // Extract data from e std::vector data_e(e.gpu_numel()); @@ -408,6 +279,103 @@ TEST_F(VulkanComputeAPITest, use_non_bound_textures_fails) { EXPECT_THROW(fill_vtensor(a, data_a), api::Error); } +TEST_F(VulkanComputeAPITest, tensor_reallocation_test) { + std::vector sizes = {4, 4, 1}; + vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); + vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); + vTensor c = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); + + execute_and_check_add(a, b, c, 3.0f, 5.0f); + + // Redo with new sizes + std::vector new_sizes = {4, 6, 3}; + a.reallocate(new_sizes); + b.reallocate(new_sizes); + c.reallocate(new_sizes); + + // Flush everything + api::context()->flush(); + + execute_and_check_add(a, b, c, 12.0f, 10.0f); +} + +TEST_F( + VulkanComputeAPITest, + tensor_reallocation_with_deferred_allocation_test) { + std::vector sizes = {8, 8, 8}; + vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); + vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); + vTensor c = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); + + api::MemoryAllocation a_mem = allocate_memory_for(a); + a.image().bind_allocation(a_mem); + api::MemoryAllocation b_mem = allocate_memory_for(b); + b.image().bind_allocation(b_mem); + api::MemoryAllocation c_mem = allocate_memory_for(c); + c.image().bind_allocation(c_mem); + + execute_and_check_add(a, b, c, 4.0f, 8.0f); + + std::vector> new_sizes_list = { + {4, 3, 5}, {4, 1, 7}, {8, 3, 2}, {8, 7, 2}}; + + for (auto& new_sizes : new_sizes_list) { + // Redo with new sizes + a.reallocate(new_sizes); + b.reallocate(new_sizes); + 
c.reallocate(new_sizes); + + // Flush everything + api::context()->flush(); + + a.image().bind_allocation(a_mem); + b.image().bind_allocation(b_mem); + c.image().bind_allocation(c_mem); + + execute_and_check_add( + a, b, c, float(new_sizes[1] + 4.5f), float(new_sizes[2] + 13.0f)); + } +} + +TEST_F(VulkanComputeAPITest, texture_virtual_resize) { + api::context()->set_cmd(/*reusable = */ true); + std::vector sizes = {8, 12, 12}; + vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); + vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); + vTensor c = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); + + DEFINE_STAGING_BUFFER_AND_RECORD_TO_GPU_FOR(a) + DEFINE_STAGING_BUFFER_AND_RECORD_TO_GPU_FOR(b) + + fill_staging(staging_buffer_a, 11.5f); + fill_staging(staging_buffer_b, 12.5f); + + record_binary_op(api::context(), "add", a, b, c); + + DEFINE_STAGING_BUFFER_AND_RECORD_FROM_GPU_FOR(c) + + submit_to_gpu(); + check_staging_buffer(staging_buffer_c, 24.0f); + + std::vector> new_sizes_list = { + {4, 2, 4}, {4, 3, 6}, {8, 12, 12}, {8, 1, 1}, {8, 11, 10}}; + + for (auto& new_sizes : new_sizes_list) { + a.virtual_resize(new_sizes); + b.virtual_resize(new_sizes); + c.virtual_resize(new_sizes); + + fill_staging(staging_buffer_a, float(new_sizes[1] + 1.5f), a.gpu_numel()); + fill_staging(staging_buffer_b, float(new_sizes[2] + 55.0f), b.gpu_numel()); + + submit_to_gpu(); + check_staging_buffer( + staging_buffer_c, + float(new_sizes[1] + new_sizes[2] + 56.5f), + c.gpu_numel()); + } +} + // // Compute Graph Tests // @@ -417,12 +385,66 @@ TEST_F(VulkanComputeAPITest, use_non_bound_textures_fails) { graph.get_val(name.value).toTensor().gpu_numel()); \ graph.copy_from_staging(name.staging, data_##name.data(), data_##name.size()); +TEST(VulkanComputeGraphTest, test_values_scalars) { + GraphConfig config; + ComputeGraph graph(config); + + ValueRef idx; + + idx = graph.add_scalar(4); + EXPECT_TRUE(graph.get_val(idx).toInt() == 4); + + idx = graph.add_scalar(5.5f); + EXPECT_TRUE(graph.get_val(idx).toDouble() == 5.5f); +} + +TEST(VulkanComputeGraphTest, test_values_scalar_list_inplace_constructed) { + GraphConfig config; + ComputeGraph graph(config); + + ValueRef idx = graph.add_scalar_list({1, 2, 3, 4}); + std::vector& arr = graph.get_val(idx).toIntList(); + EXPECT_TRUE(arr.size() == 4); + for (int i = 0; i < 4; i++) { + EXPECT_TRUE(arr[i] == i + 1); + } +} + +TEST(VulkanComputeGraphTest, test_values_scalar_list_outside_constructed) { + GraphConfig config; + ComputeGraph graph(config); + + ValueRef idx; + { + std::vector data = {5.0, 4.0, 3.0, 2.0, 1.0}; + idx = graph.add_scalar_list(std::move(data)); + } + std::vector& arr = graph.get_val(idx).toDoubleList(); + EXPECT_TRUE(arr.size() == 5); + for (int i = 0; i < 5; i++) { + EXPECT_TRUE(arr[i] == (5 - i)); + } +} + +TEST(VulkanComputeGraphTest, test_values_string) { + GraphConfig config; + ComputeGraph graph(config); + + ValueRef idx; + { + std::string data = "hello, world"; + idx = graph.add_string(std::move(data)); + } + std::string& stored = graph.get_val(idx).toString(); + EXPECT_TRUE(stored == "hello, world"); +} + TEST(VulkanComputeGraphTest, test_simple_graph) { - GraphConfig config = generate_graph_config(); + GraphConfig config; ComputeGraph graph(config); - std::vector size_big = {4, 4, 4}; - std::vector size_small = {4, 4, 1}; + std::vector size_big = {8, 64, 124}; + std::vector size_small = {8, 1, 124}; // Build graph @@ -431,10 +453,14 @@ TEST(VulkanComputeGraphTest, test_simple_graph) { IOValueRef out = {}; 
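A condensed sketch of the graph lifecycle that the added lines in this hunk follow; it is illustrative only and restates calls introduced by this patch rather than any new API: build the graph through the operator registry, prepare and encode once, then stage inputs, execute, and read back per iteration.

// Build: ops are added via the registry; kDummyValueRef fills the unused
// alpha slot of aten.add.Tensor.
out.value = graph.add_tensor(size_big, api::kFloat);
auto add_fn = VK_GET_OP_FN("aten.add.Tensor");
add_fn(graph, {a.value, b.value, kDummyValueRef, out.value});
out.staging = graph.set_output_tensor(out.value);

// Prepare the graph and record command buffers once.
graph.prepare();
graph.encode_execute();

// Per iteration: stage inputs, run, copy the output back out.
fill_vtensor(graph, a, 1.0f);
fill_vtensor(graph, b, 2.0f);
graph.execute();
EXTRACT_TENSOR(out);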
- out.value = add_arithmetic_node(graph, a.value, b.value, 1.0, VK_KERNEL(add)); + out.value = graph.add_tensor(size_big, api::kFloat); + + auto addFn = VK_GET_OP_FN("aten.add.Tensor"); + addFn(graph, {a.value, b.value, kDummyValueRef, out.value}); out.staging = graph.set_output_tensor(out.value); + graph.prepare(); graph.encode_execute(); // Run graph @@ -464,11 +490,11 @@ TEST(VulkanComputeGraphTest, test_simple_graph) { ValueRef name = graph.add_tensorref(sizes, api::kFloat, data_##name.data()); TEST(VulkanComputeGraphTest, test_simple_prepacked_graph) { - GraphConfig config = generate_graph_config(); + GraphConfig config; ComputeGraph graph(config); - std::vector size_big = {4, 4, 4}; - std::vector size_small = {4, 4, 1}; + std::vector size_big = {8, 73, 62}; + std::vector size_small = {8, 73, 1}; CREATE_WEIGHT_TENSOR(w1, size_small, 3.5f); CREATE_WEIGHT_TENSOR(w2, size_small, 3.0f); @@ -477,13 +503,21 @@ TEST(VulkanComputeGraphTest, test_simple_prepacked_graph) { IOValueRef a = graph.add_input_tensor(size_big, api::kFloat); - ValueRef c = add_arithmetic_node(graph, a.value, w1, 1.0, VK_KERNEL(add)); - ValueRef e = add_arithmetic_node(graph, c, w2, 1.0, VK_KERNEL(mul)); + ValueRef c = graph.add_tensor(size_big, api::kFloat); + ValueRef e = graph.add_tensor(size_big, api::kFloat); + + auto addFn = VK_GET_OP_FN("aten.add.Tensor"); + addFn(graph, {a.value, w1, kDummyValueRef, c}); + + auto mulFn = VK_GET_OP_FN("aten.mul.Tensor"); + mulFn(graph, {c, w2, e}); IOValueRef out = {}; out.value = e; out.staging = graph.set_output_tensor(out.value); + graph.prepare(); + graph.encode_prepack(); graph.prepack(); @@ -508,12 +542,12 @@ TEST(VulkanComputeGraphTest, test_simple_prepacked_graph) { } } -TEST(VulkanComputeGraphTest, test_simple_shared_objects) { - GraphConfig config = generate_graph_config(); +TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { + GraphConfig config; ComputeGraph graph(config); - std::vector size_big = {4, 4, 4}; - std::vector size_small = {4, 4, 1}; + std::vector size_big = {12, 64, 64}; + std::vector size_small = {12, 64, 64}; // Build graph @@ -526,59 +560,70 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects) { api::kFloat, /*shared_object_idx = */ 4); - // Allocation count will be 2: - // 1 staging buffer for each input tensor - EXPECT_TRUE(get_vma_allocation_count() == 2); + // Allocation count will be 6: + // 4: t.gpu_sizes_ubo(), t.cpu_sizes_ubo() for each staging shader + // 2: staging buffer for each input tensor + EXPECT_TRUE(get_vma_allocation_count() == 6); - ValueRef c = add_arithmetic_node( - graph, - a.value, - b.value, - 1.0, - VK_KERNEL(add), + ValueRef c = graph.add_tensor( + size_big, + api::kFloat, /*shared_object_idx = */ 6); + auto addFn = VK_GET_OP_FN("aten.add.Tensor"); + addFn(graph, {a.value, b.value, kDummyValueRef, c}); + IOValueRef d = graph.add_input_tensor( size_small, api::kFloat, /*shared_object_idx = */ 2); - // Allocation count will be 4, two are new: - // 1 uniform buffer for arithmetic shader params - // 1 staging buffer for the input tensor - EXPECT_TRUE(get_vma_allocation_count() == 4); - - ValueRef e = add_arithmetic_node( - graph, - c, - d.value, - 1.0, - VK_KERNEL(mul), + // Allocation count will be 11, 5 are new: + // 2: out.gpu_sizes_ubo(), alpha UBO for arithmetic shader + // 2: t.gpu_sizes_ubo(), t.cpu_sizes_ubo() uniform buffer for staging shader + // 1: staging buffer for the input tensor + EXPECT_TRUE(get_vma_allocation_count() == 11); + + ValueRef e = graph.add_tensor( + size_big, + api::kFloat, 
/*shared_object_idx = */ 4); + auto mulFn = VK_GET_OP_FN("aten.mul.Tensor"); + mulFn(graph, {c, d.value, e}); + IOValueRef out = {}; out.value = e; out.staging = graph.set_output_tensor(out.value); - // Allocation count will be 6, three are new: - // 1 uniform buffer for arithmetic shader params + // Allocation count will be 15, 4 are new: + // 1: alpha UBO for arithmetic shader + // 2: t.gpu_sizes_ubo(), t.cpu_sizes_ubo() for staging shader // 1 staging buffer for the input tensor - EXPECT_TRUE(get_vma_allocation_count() == 6); + EXPECT_TRUE(get_vma_allocation_count() == 15); + graph.prepare(); graph.encode_execute(); - // Allocation count will be 13: - // 4 staging buffers for each I/O tensor - // 6 uniform buffers to store params for each shader dispatch - // 3 shared objects to back tensor memory - EXPECT_TRUE(get_vma_allocation_count() == 13); + // Allocation count will be 18, 3 are new: + // 3: shared memory allocations for tensors + EXPECT_TRUE(get_vma_allocation_count() == 18); // Run graph - for (float i = 4.0f; i < 30.0f; i += 7.0f) { - float val_a = i + 2.0f; - float val_b = i + 1.5f; - float val_d = i + 1.0f; + std::vector> new_sizes_list = { + {8, 44, 34}, {4, 13, 56}, {8, 12, 64}, {12, 55, 33}, {4, 54, 10}}; + + for (auto& new_sizes : new_sizes_list) { + graph.get_val(a.value).toTensor().virtual_resize(new_sizes); + graph.get_val(b.value).toTensor().virtual_resize(new_sizes); + graph.get_val(c).toTensor().virtual_resize(new_sizes); + graph.get_val(d.value).toTensor().virtual_resize(new_sizes); + graph.get_val(e).toTensor().virtual_resize(new_sizes); + + float val_a = new_sizes[1] + 4.0f; + float val_b = new_sizes[2] + 1.5f; + float val_d = new_sizes[0] + 2.0f; float val_out = (val_a + val_b) * val_d; fill_vtensor(graph, a, val_a); @@ -595,4 +640,91 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects) { EXPECT_TRUE(val == val_out); } } + + std::vector> new_sizes_list_2 = { + {8, 44, 34}, {4, 13, 56}, {8, 12, 64}, {12, 55, 33}, {4, 54, 10}}; + + for (auto& new_sizes : new_sizes_list_2) { + graph.resize_input(0, new_sizes); + graph.resize_input(1, new_sizes); + graph.resize_input(2, new_sizes); + graph.propagate_resize(); + + // Check output shape + EXPECT_TRUE(graph.get_val(out.value).toTensor().sizes() == new_sizes); + + float val_a = new_sizes[1] + 6.0f; + float val_b = new_sizes[2] + 2.5f; + float val_d = new_sizes[0] + 4.0f; + float val_out = (val_a + val_b) * val_d; + + fill_vtensor(graph, a, val_a); + fill_vtensor(graph, b, val_b); + fill_vtensor(graph, d, val_d); + + // Execute graph + graph.execute(); + + EXTRACT_TENSOR(out); + + // Sanity check that the values are correct + for (const auto& val : data_out) { + ASSERT_TRUE(val == val_out); + } + } +} + +TEST(VulkanComputeGraphTest, test_large_graph) { + GraphConfig config; + ComputeGraph graph(config); + + int64_t input_w = 256; + int64_t input_h = 256; + int64_t input_c = 8; + + std::vector size_big = {input_c, input_h, input_w}; + std::vector size_small = {input_c, input_h, 1}; + + // Build graph + + IOValueRef a = graph.add_input_tensor(size_big, api::kFloat, 2); + IOValueRef b = graph.add_input_tensor(size_small, api::kFloat, 4); + + ValueRef c = graph.add_tensor(size_big, api::kFloat, 6); + + auto addFn = VK_GET_OP_FN("aten.add.Tensor"); + addFn(graph, {a.value, b.value, kDummyValueRef, c}); + + int n = 100; + + for (int i = 0; i < n; i++) { + addFn(graph, {c, b.value, kDummyValueRef, a.value}); + + addFn(graph, {a.value, b.value, kDummyValueRef, c}); + } + + IOValueRef out = {}; + out.value = c; + 
out.staging = graph.set_output_tensor(out.value); + + graph.prepare(); + graph.encode_execute(); + + for (int i = 0; i < 10; i++) { + float val_a = 1.0f; + float val_b = 2.0f; + + float val_e = val_a + val_b * (2 * n + 1); + + fill_vtensor(graph, a, val_a); + fill_vtensor(graph, b, val_b); + + graph.execute(); + + EXTRACT_TENSOR(out); + + for (const auto& val : data_out) { + EXPECT_TRUE(val == val_e); + } + } } diff --git a/backends/vulkan/vulkan_preprocess.py b/backends/vulkan/vulkan_preprocess.py index 293d114e8d..91a85f15a1 100644 --- a/backends/vulkan/vulkan_preprocess.py +++ b/backends/vulkan/vulkan_preprocess.py @@ -6,8 +6,6 @@ from typing import final, List -import executorch.backends.vulkan.serialization.vulkan_graph_schema as vk_graph_schema - from executorch.backends.vulkan.serialization.vulkan_graph_builder import VkGraphBuilder from executorch.backends.vulkan.serialization.vulkan_graph_serialize import ( serialize_vulkan_graph, @@ -22,21 +20,15 @@ from executorch.exir.passes import MemoryPlanningPass, SpecPropPass +from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass + from executorch.exir.program._program import _copy_module -from torch import dtype, float32 DEFAULT_DEBUG_HANDLE = 65535 @final class VulkanBackend(BackendDetails): - @staticmethod - def get_vk_datatype(torch_dtype: dtype) -> vk_graph_schema.VkDataType: - if torch_dtype == float32: - return vk_graph_schema.VkDataType.fp32 - else: - raise AssertionError(f"Invalid dtype for vulkan_preprocess ({torch_dtype})") - @classmethod # pyre-ignore def preprocess( # noqa: C901 @@ -46,6 +38,7 @@ def preprocess( # noqa: C901 ) -> PreprocessResult: passes = [ SpecPropPass(), + ConstraintBasedSymShapeEvalPass(), MemoryPlanningPass("greedy"), ] diff --git a/backends/xnnpack/runtime/XNNCompiler.cpp b/backends/xnnpack/runtime/XNNCompiler.cpp index ef593e0609..c184f60f08 100644 --- a/backends/xnnpack/runtime/XNNCompiler.cpp +++ b/backends/xnnpack/runtime/XNNCompiler.cpp @@ -149,8 +149,8 @@ Error defineTensor( ValuePtr value, GraphPtr flatbuffer_graph, const uint8_t* constant_data_ptr, - XNNExecutor* executor, - MemoryAllocator* runtime_allocator) { + std::vector& input_ids, + std::vector& output_ids) { const fb_xnnpack::XNNTensorValue* tensor_value = nullptr; const fb_xnnpack::XNNQuantizedTensorValue* qtensor_value = nullptr; @@ -272,7 +272,6 @@ Error defineTensor( /*external_id=*/tensor_value->external_id(), /*flags=*/tensor_value->flags(), /*id_out=*/&float_id); - executor->addDynamicQinput(float_id); // Define dynamic conversion from float to qdint8 status = xnn_define_convert( @@ -391,10 +390,13 @@ Error defineTensor( // map serialized id to newly generated id remapped_ids.emplace(std::make_pair(tensor_value->id_out(), id)); - // Append this external id to the arg list for execute(*args) to extract from - // as args[external_id] - if (tensor_value->external_id() != XNN_INVALID_VALUE_ID) { - executor->append_arg(tensor_value->external_id()); + + // Add external ids to either list of input or output ids + if (tensor_value->flags() & XNN_VALUE_FLAG_EXTERNAL_INPUT) { + input_ids.push_back(tensor_value->external_id()); + } + if (tensor_value->flags() & XNN_VALUE_FLAG_EXTERNAL_OUTPUT) { + output_ids.push_back(tensor_value->external_id()); } return Error::Ok; @@ -1594,6 +1596,9 @@ __ET_NODISCARD Error XNNCompiler::compileModel( // Invalid ids do not need to be remapped remapped_ids.emplace(XNN_INVALID_VALUE_ID, XNN_INVALID_VALUE_ID); + // External Ids for inputs and outputs + std::vector input_ids; + 
std::vector output_ids; Error err = Error::Ok; for (auto value : *flatbuffer_graph->xvalues()) { err = defineTensor( @@ -1602,8 +1607,8 @@ __ET_NODISCARD Error XNNCompiler::compileModel( value, flatbuffer_graph, constant_data, - executor, - runtime_allocator); + input_ids, + output_ids); if (err != Error::Ok) { return err; @@ -1635,47 +1640,10 @@ __ET_NODISCARD Error XNNCompiler::compileModel( "XNN Runtime creation failed with code: %s", xnn_status_to_string(status)); - executor->initialize(runtime_ptr); // NOLINT: runtime_ptr is non-null as - // error is checked above. - - // HACK FOR FC/BC this is only to support old dq_datatype - if (executor->qinputs_.size() > 0) { - // qinputs_ is only set when using the old dq linear path. At which point - // We need to overide the input_ids_ This workse based off the assumption - // old dqlinear path will be single node single input delegate - for (uint32_t id : executor->qinputs_) { - executor->input_ids_.emplace_back(id); - } - } else { - for (auto old_id : *flatbuffer_graph->input_ids()) { - executor->input_ids_.emplace_back(remapped_ids.at(old_id)); - } - } - // External ids need to be in order for wiring with args - std::sort(executor->input_ids_.begin(), executor->input_ids_.end()); - - for (auto old_id : *flatbuffer_graph->output_ids()) { - executor->output_ids_.emplace_back(remapped_ids.at(old_id)); - } - // External ids need to be in order for wiring with args - std::sort(executor->output_ids_.begin(), executor->output_ids_.end()); - - if (!executor->qinputs_.empty() && flatbuffer_graph->xnodes()->size() > 0 && - flatbuffer_graph->xnodes()->Get(0)->xnode_union_type() == - fb_xnnpack::XNodeUnion::XNNFullyConnected) { -#ifdef ENABLE_DYNAMIC_QUANTIZATION - // This delegate is for DQLinear which supports dynamic input shapes - if (executor->getNumInputs() < 1 || executor->getNumOutputs() != 1) { - ET_LOG( - Error, - "DQLinear should have at least one input and exactly one output"); - return Error::NotSupported; - } -#else - ET_LOG(Error, "DQ Linear is not supported"); - return Error::NotSupported; -#endif - } + err = executor->initialize( // NOLINT: runtime_ptr is non-null + runtime_ptr, + std::move(input_ids), + std::move(output_ids)); return err; }; diff --git a/backends/xnnpack/runtime/XNNExecutor.cpp b/backends/xnnpack/runtime/XNNExecutor.cpp index 592d9574d1..a5cd3b6737 100644 --- a/backends/xnnpack/runtime/XNNExecutor.cpp +++ b/backends/xnnpack/runtime/XNNExecutor.cpp @@ -13,34 +13,208 @@ namespace executor { namespace xnnpack { namespace delegate { -Error XNNExecutor::set_external_input( - uint32_t id, - Tensor* input, - struct XNNShape* shape) { - // TODO(T165403530): Test ensure accuracy for int64 --> float32 conversion - if (input->scalar_type() == ScalarType::Long) { - // Input data type is int64. However, XNNPACK doesn't support - // int64. This means that the data needs to be casted to float - // In order for XNNPACK to properly use it. 
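The per-tensor plumbing being removed from XNNExecutor here is replaced by a much smaller surface: initialize() once at compile time with the external input/output ids, then prepare_args(), forward(), and resize_outputs() per delegate call. Seen from the backend (the XNNPACKBackend.cpp hunk later in this patch), one invocation reduces to the sketch below, with error logging elided.

// args is the EValue** array handed to the delegate's execute().
Error err = executor->prepare_args(args);   // bind data ptrs, reshape runtime
if (err != Error::Ok) {
  return err;
}
err = executor->forward(context);           // set up and invoke the XNN runtime
if (err != Error::Ok) {
  return err;
}
return executor->resize_outputs(args);      // push inferred shapes back out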
- const int64_t* data_64 = input->const_data_ptr(); - float* data_f32 = input->mutable_data_ptr(); - for (int j = 0; j < input->numel(); j++) { - data_f32[j] = data_64[j]; +using Tensor = exec_aten::Tensor; +using ScalarType = exec_aten::ScalarType; +using SizesType = exec_aten::SizesType; + +/** + * Initializes the XNNExecutor with the runtime and given number of + * inputs/outputs externals_ is resized to the total number of inputs and + * outputs + */ +__ET_NODISCARD Error XNNExecutor::initialize( + xnn_runtime_t runtime, + std::vector&& input_ids, + std::vector&& output_ids) { + runtime_ = std::unique_ptr( + runtime, xnn_delete_runtime); + + auto error = profiler_.initialize(runtime); + if (error != Error::Ok) { + ET_LOG( + Error, + "Failed to start profiling: %u.", + static_cast(error)); + } + + // Initialize the external values for inputs and outputs + // mapping the executorch arg idx to external IDs + input_ids_ = std::move(input_ids); + std::sort(input_ids_.begin(), input_ids_.end()); + + output_ids_ = std::move(output_ids); + std::sort(output_ids_.begin(), output_ids_.end()); + + externals_.resize(input_ids_.size() + output_ids_.size()); + + return Error::Ok; +} + +/** + * Prepares the args for XNNPACK Runtime. + * + * Creates an array of xnn_externals_values from the EValues passed in. + * Reshapes all the external input tensors, in case any input shapes have + * changed. The reshapes the entire runtime, propagating shape information + * through the runtime. + * + * Note: the external ids given to the external tensors in the XNNPACK + * runtime correspond to their index in the list of arg passed into + * delegate->execute() + */ +__ET_NODISCARD Error XNNExecutor::prepare_args(EValue** args) { + // Create xnn_externals_value from evalue args + xnn_status status; + for (uint32_t i = 0; i < externals_.size(); ++i) { + if (i < input_ids_.size()) { + externals_[i].id = input_ids_[i]; + } else { + externals_[i].id = output_ids_[i - input_ids_.size()]; + } + uint32_t ext_id = externals_[i].id; + + ET_CHECK_OR_RETURN_ERROR( + args[ext_id]->isTensor(), + InvalidArgument, + "Expected argument to delegate at index %u to be a Tensor, but got %" PRIu32, + i, + static_cast(args[ext_id]->tag)); + + Tensor* tensor = &args[ext_id]->toTensor(); + externals_[i].data = tensor->mutable_data_ptr(); + + // Reshape runtime inputs + if (i < input_ids_.size()) { + size_t num_dims = tensor->dim(); + size_t dims[XNN_MAX_TENSOR_DIMS]; + for (int d = 0; d < num_dims; ++d) { + dims[d] = tensor->size(d); + } + status = + xnn_reshape_external_value(runtime_.get(), ext_id, num_dims, dims); + ET_CHECK_OR_RETURN_ERROR( + status == xnn_status_success, + Internal, + "Internal Error: Reshape Input Tensor Failed with code: %s", + xnn_status_to_string(status)); } } - if (input->dim() != shape->num_dims) { - ET_LOG(Error, "Input dim mismatch between tensor and shape struct"); + // // Propagate Input Shape and Memory Plan for increased allocation + status = xnn_reshape_runtime(runtime_.get()); + + ET_CHECK_OR_RETURN_ERROR( + status == xnn_status_success, + Internal, + "Internal Error: Propagating input shapes failed with code: %s", + xnn_status_to_string(status)); + + return Error::Ok; +} + +/** + * Runs the XNNPACK Runtime. + * + * We first setup the runtime by feeding the externals_ to runtime setup. + * After which we then execute the runtime through invoke_runtime. 
+ */ +__ET_NODISCARD Error XNNExecutor::forward(BackendExecutionContext& context) { + ET_CHECK_OR_RETURN_ERROR( + runtime_ != nullptr, + Internal, + "XNNPACK Delegate did not compile correctly"); + + xnn_status status = xnn_setup_runtime_v2( + runtime_.get(), externals_.size(), externals_.data()); + + ET_CHECK_OR_RETURN_ERROR( + status == xnn_status_success, + Internal, + "Internal Error: Setting up the runtime failed with code: %s", + xnn_status_to_string(status)); + + auto error = profiler_.start(context.event_tracer()); + if (error != Error::Ok) { + ET_LOG( + Error, + "Failed to start profiling: %u.", + static_cast(error)); + } + + status = xnn_invoke_runtime(runtime_.get()); + + error = profiler_.end(); + if (error != Error::Ok) { + ET_LOG( + Error, + "Failed to end profiling: %u.", + static_cast(error)); + } + + ET_CHECK_OR_RETURN_ERROR( + status == xnn_status_success, + Internal, + "XNN Runtime invoke failed with code: %s", + xnn_status_to_string(status)); + + return Error::Ok; +} + +/** + * Prepares the outputs for ExecuTorch + * + * Resizes the output tensors based on the output shapes returned by + * the xnnpack runtime. + * + * Note: For arg_max pooling, we recast the output index tensor. Since + * XNNPACK gives the index tensor to us as int32, we need to convert it + * back to int64 for ExecuTorch. + */ +__ET_NODISCARD Error XNNExecutor::resize_outputs(EValue** args) const { + size_t output_idx_start = input_ids_.size(); + for (size_t i = output_idx_start; i < externals_.size(); ++i) { + uint32_t ext_id = externals_[i].id; + Tensor* out_tensor = &args[ext_id]->toTensor(); + + size_t num_dim; + size_t dims[XNN_MAX_TENSOR_DIMS]; + + // Fetch the updated output shapes from xnnpack runtime + xnn_status status = + xnn_get_external_value_shape(runtime_.get(), ext_id, &num_dim, dims); + + ET_CHECK_OR_RETURN_ERROR( + status == xnn_status_success, + Internal, + "Internal Error: Failed to retrieve graph output shapes"); + + // Convert new output shape into SizesType + SizesType expected_output_size[kTensorDimensionLimit]; + for (size_t d = 0; d < num_dim; ++d) { + expected_output_size[d] = static_cast(dims[d]); + } + + exec_aten::ArrayRef output_size{ + expected_output_size, static_cast(num_dim)}; + + ET_LOG(Debug, "Resizing output tensor to a new shape"); + Error err = resize_tensor(*out_tensor, output_size); + if (err != Error::Ok) { + ET_LOG(Error, "Failed to resize output tensor for XNNExecutor"); + return err; + } + + // Output datatype is int64. However, XNNPACK doesn't support + // int64. 
This means that the data was put into this tensor + // by XNNPACK as int32 and needs to be copied to int64 form + if (out_tensor->scalar_type() == ScalarType::Long) { + int64_t* data_64 = out_tensor->mutable_data_ptr(); + const int32_t* data_32 = out_tensor->const_data_ptr(); + for (size_t j = out_tensor->numel() - 1; j >= 0; --j) { + data_64[j] = data_32[j]; + } + } } -#ifdef ENABLE_DYNAMIC_QUANTIZATION - externals_.emplace_back(xnn_external_value{ - id, - input->mutable_data_ptr(), - static_cast(shape->num_dims), - shape->dim}); -#else - externals_.emplace_back(xnn_external_value{id, input->mutable_data_ptr()}); -#endif return Error::Ok; } diff --git a/backends/xnnpack/runtime/XNNExecutor.h b/backends/xnnpack/runtime/XNNExecutor.h index d4a4677392..b13951bdd1 100644 --- a/backends/xnnpack/runtime/XNNExecutor.h +++ b/backends/xnnpack/runtime/XNNExecutor.h @@ -24,11 +24,6 @@ namespace executor { namespace xnnpack { namespace delegate { -struct XNNShape { - size_t num_dims; - size_t dim[XNN_MAX_TENSOR_DIMS]; -}; - class XNNExecutor { private: std::unique_ptr runtime_{ @@ -38,42 +33,11 @@ class XNNExecutor { profiling::XNNProfiler profiler_; std::vector input_ids_; std::vector output_ids_; - std::vector external_id_args_; - bool is_sorted_args_list_ = false; std::vector externals_; - std::vector qinputs_; - - Error set_external_input(uint32_t id, Tensor* input, struct XNNShape* shape); public: XNNExecutor() = default; - inline void append_arg(uint32_t id) { - external_id_args_.push_back(id); - // Insertion order is not guaranteed here. - is_sorted_args_list_ = false; - } - - inline size_t get_args_size() { - return external_id_args_.size(); - } - - inline uint32_t get_arg_index(size_t i) { - if (!is_sorted_args_list_) { - // Could have been inserted out of order. 
- sort(external_id_args_.begin(), external_id_args_.end()); - is_sorted_args_list_ = true; - } - - size_t ret = external_id_args_.size(); - ET_CHECK_MSG( - i < ret, - "Invalid arg index, requested: %zu, total args consumed by xnnpack: %zu\n", - i, - ret); - return external_id_args_[i]; - } - inline size_t getNumInputs() { return input_ids_.size(); } @@ -82,147 +46,35 @@ class XNNExecutor { return output_ids_.size(); } - inline void initialize(xnn_runtime_t runtime) { - runtime_ = std::unique_ptr( - runtime, xnn_delete_runtime); - - auto error = profiler_.initialize(runtime); - ET_CHECK_MSG( - error == Error::Ok, - "Failed to initialize profiler with error: %d", - static_cast(error)); - } - - inline void addDynamicQinput(uint32_t id) { - qinputs_.emplace_back(id); - } - - __ET_NODISCARD Error set_inputs( - std::vector& inputs, - std::vector& outputs, - std::vector& input_shapes, - std::vector& output_shapes) { - externals_.clear(); - - ET_CHECK_OR_RETURN_ERROR( - inputs.size() == input_ids_.size(), - InvalidArgument, - "Expected %zu inputs but given %zu", - input_ids_.size(), - inputs.size()); - - for (int i = 0; i < inputs.size(); i++) { - auto err = set_external_input(input_ids_[i], inputs[i], &input_shapes[i]); - ET_CHECK_OR_RETURN_ERROR( - err == Error::Ok, Internal, "Failed to set_external_input"); - } - ET_CHECK_OR_RETURN_ERROR( - outputs.size() == output_ids_.size(), - InvalidArgument, - "Expected %zu outputs gut given %zu", - output_ids_.size(), - outputs.size()); - - for (int i = 0; i < outputs.size(); i++) { -#ifdef ENABLE_DYNAMIC_QUANTIZATION - externals_.emplace_back(xnn_external_value{ - output_ids_[i], - outputs[i]->mutable_data_ptr(), - static_cast(output_shapes[i].num_dims), - output_shapes[i].dim}); -#else - externals_.emplace_back(xnn_external_value{ - output_ids_[i], outputs[i]->mutable_data_ptr()}); -#endif - } - - return Error::Ok; - } - - __ET_NODISCARD Error forward(BackendExecutionContext& context) { - ET_CHECK_OR_RETURN_ERROR( - runtime_ != nullptr, - Internal, - "XNNPACK Delegate did not compile correctly"); - xnn_status status = - xnn_setup_runtime(runtime_.get(), externals_.size(), externals_.data()); - - ET_CHECK_OR_RETURN_ERROR( - status == xnn_status_success, - Internal, - "XNN Runtime setup failed with code: %s", - xnn_status_to_string(status)); - - auto error = profiler_.start(context.event_tracer()); - if (error != Error::Ok) { - ET_LOG( - Error, - "Failed to start profiling: %u.", - static_cast(error)); - } - - status = xnn_invoke_runtime(runtime_.get()); - - error = profiler_.end(); - if (error != Error::Ok) { - ET_LOG( - Error, - "Failed to end profiling: %u.", - static_cast(error)); - } - - ET_CHECK_OR_RETURN_ERROR( - status == xnn_status_success, - Internal, - "XNN Runtime invoke failed with code: %s", - xnn_status_to_string(status)); - - return Error::Ok; - } - - /** Resize output tensor to support dynamic input shapes */ - __ET_NODISCARD Error resizeOutput( - exec_aten::Tensor* output_tensor, - struct XNNShape* output_shape) const { - const size_t n_dim = output_tensor->dim(); - - // Rank can't change - if (n_dim != output_shape->num_dims) { - ET_LOG( - Error, - "Found output shape with a different number of dimensions than the output tensor. Expected: %zu, Actual: %zu", - n_dim, - output_shape->num_dims); - return Error::NotSupported; - } - - // Early exit? 
- bool same_shape = true; - for (size_t i = 0; (i < n_dim) && same_shape; i++) { - same_shape = (output_tensor->size(i) == output_shape->dim[i]); - } - if (same_shape) { - return Error::Ok; - } - - exec_aten::SizesType expected_output_size[kTensorDimensionLimit]; - for (size_t i = 0; i < n_dim; i++) { - expected_output_size[i] = - static_cast(output_shape->dim[i]); - } - - exec_aten::ArrayRef output_size{ - expected_output_size, static_cast(output_tensor->dim())}; - - // Ok to dereference pointer here because resize_tensor takes in a tensor - // and not a tensor& - ET_LOG(Debug, "Resizing output tensor to a new shape"); - Error err = resize_tensor(*output_tensor, output_size); - if (err != Error::Ok) { - ET_LOG(Error, "Failed to resize output tensor for XNNExecutor"); - } - return err; - } + /** + * Initialize the XNNExecutor with a given runtime and input/output ids. + * The input/output ids are expected to be sorted in order of their + * flatbuffer id_outs + */ + __ET_NODISCARD Error initialize( + xnn_runtime_t runtime, + std::vector&& input_ids, + std::vector&& output_ids); + + /** + * Prepares the arguments for runtime graph execution. + * args is an array of EValues that will be passed into the runtime. + * input shapes will be propagated through the runtime, and perform + * any additional memory planning as needed + */ + __ET_NODISCARD Error prepare_args(EValue** args); + + /** + * Executes the graph using the args prepared at prepare_args(). + */ + __ET_NODISCARD Error forward(BackendExecutionContext& context); + + /** + * Prepares the outputs to be returned by the delegate + * + * Performs any post processing of outputs like tensor resizing + */ + __ET_NODISCARD Error resize_outputs(EValue** args) const; friend class XNNCompiler; }; diff --git a/backends/xnnpack/runtime/XNNPACKBackend.cpp b/backends/xnnpack/runtime/XNNPACKBackend.cpp index 7fd921ad9d..33d7ebebfe 100644 --- a/backends/xnnpack/runtime/XNNPACKBackend.cpp +++ b/backends/xnnpack/runtime/XNNPACKBackend.cpp @@ -60,84 +60,20 @@ class XnnpackBackend final : public PyTorchBackendInterface { EValue** args) const override { auto executor = static_cast(handle); - // TODO merge these two in a single struct? - std::vector input_pointers; - std::vector output_pointers; - std::vector input_shapes; - std::vector output_shapes; - - ET_CHECK_OR_RETURN_ERROR( - executor->get_args_size() == - executor->getNumInputs() + executor->getNumOutputs(), - Internal, - "External id and expected delegate args mismatch"); - - // Intialize XNNShapes for both inputs and outputs. - // That will allow us to gradually build up shape inference support for - // xnnpack ops without breaking existing models when we always try to resize - // delegate output tensor(s). 
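The XNNShape bookkeeping deleted here is superseded by XNNPACK's own shape propagation, driven from inside the executor (see the XNNExecutor.cpp hunk above). Stripped of error checks, the dynamic-shape path is roughly the sketch below; runtime, ext_id, tensor, and out_tensor are stand-ins for values the executor already holds, and only calls that appear in this patch are used.

// Inputs: feed the current ExecuTorch tensor sizes to the runtime.
size_t dims[XNN_MAX_TENSOR_DIMS];
for (int d = 0; d < tensor->dim(); ++d) {
  dims[d] = tensor->size(d);
}
xnn_reshape_external_value(runtime, ext_id, tensor->dim(), dims);

// Propagate shapes (and any extra memory planning) through the graph.
xnn_reshape_runtime(runtime);

// Outputs: query the inferred shape, then resize_tensor() is called on
// out_tensor with the queried sizes.
size_t num_dim = 0;
xnn_get_external_value_shape(runtime, ext_id, &num_dim, dims);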
- input_shapes.resize(executor->getNumInputs()); - output_shapes.resize(executor->getNumOutputs()); - - for (int i = 0; i < executor->get_args_size(); i++) { - int index = executor->get_arg_index(i); - - if (!args[index]->isTensor()) { - ET_LOG(Error, "Expected argument to be a tensor"); - } - - Tensor* tensor = &args[index]->toTensor(); - size_t num_dims = tensor->dim(); - struct xnnpack::delegate::XNNShape* shape = nullptr; - - if (i < executor->getNumInputs()) { - input_pointers.push_back(tensor); - shape = &input_shapes[i]; - } else { - output_pointers.push_back(tensor); - shape = &output_shapes[i - executor->getNumInputs()]; - } - - shape->num_dims = num_dims; - for (int d = 0; d < num_dims; ++d) { - shape->dim[d] = tensor->size(d); - } - } - - Error err = executor->set_inputs( - input_pointers, output_pointers, input_shapes, output_shapes); - + // Prepare Inputs/Outputs and Propagate Input Shapes + Error err = executor->prepare_args(args); if (err != Error::Ok) { return err; } err = executor->forward(context); - // Resize output tensors - should be a no-op for static models - for (int i = 0; i < executor->getNumOutputs(); i++) { - err = executor->resizeOutput(output_pointers[i], &output_shapes[i]); - if (err != Error::Ok) { - return err; - } + if (err != Error::Ok) { + return err; } - for (int i = executor->getNumInputs(); - i < executor->getNumInputs() + executor->getNumOutputs(); - i++) { - if (args[i]->isTensor()) { - exec_aten::Tensor output_tensor = args[i]->toTensor(); - if (output_tensor.scalar_type() == ScalarType::Long) { - // Output datatype is int64. However, XNNPACK doesn't support - // int64. This means that the data was put into this tensor - // by XNNPACK as int32 and needs to be copied to int64 form - int64_t* data_64 = output_tensor.mutable_data_ptr(); - const int32_t* data_32 = output_tensor.const_data_ptr(); - for (int j = output_tensor.numel() - 1; j >= 0; j--) { - data_64[j] = data_32[j]; - } - } - } - } + // Resize outputs and recast pointers if necessary + err = executor->resize_outputs(args); return err; } diff --git a/backends/xnnpack/targets.bzl b/backends/xnnpack/targets.bzl index 92043b5fea..915549155d 100644 --- a/backends/xnnpack/targets.bzl +++ b/backends/xnnpack/targets.bzl @@ -37,7 +37,7 @@ def define_common_targets(): ], preprocessor_flags = [ # "-DENABLE_XNNPACK_PROFILING", - ] + ([] if runtime.is_oss else ["-DENABLE_DYNAMIC_QUANTIZATION"]), + ], deps = [ third_party_dep("XNNPACK"), "//executorch/runtime/backend:interface", diff --git a/backends/xnnpack/test/tester/tester.py b/backends/xnnpack/test/tester/tester.py index ae8fcc23db..ec03fa2529 100644 --- a/backends/xnnpack/test/tester/tester.py +++ b/backends/xnnpack/test/tester/tester.py @@ -501,12 +501,25 @@ def _assert_outputs_equal(model_output, ref_output, atol=1e-03, rtol=1e-03): assert len(model_output) == len(ref_output) for i in range(len(model_output)): + model = model_output[i] + ref = ref_output[i] assert torch.allclose( - model_output[i], - ref_output[i], + model, + ref, atol=atol, rtol=rtol, - ), f" Output {i} does not match reference output. Max difference: {torch.max(torch.abs(model_output[i] - ref_output[i]))}" + ), ( + f"Output {i} does not match reference output.\n" + f"\tGiven atol: {atol}, rtol: {rtol}.\n" + f"\tOutput tensor shape: {model.shape}, dtype: {model.dtype}\n" + f"\tDifference: max: {torch.max(model-ref)}, abs: {torch.max(torch.abs(model-ref))}.\n" + f"\t-- Model vs. 
Reference --\n" + f"\t Numel: {model.numel()}, {ref.numel()}\n" + f"\tMedian: {model.median()}, {ref.median()}\n" + f"\t Mean: {model.mean()}, {ref.mean()}\n" + f"\t Max: {model.max()}, {ref.max()}\n" + f"\t Min: {model.min()}, {ref.min()}\n" + ) def compare_outputs(self, atol=1e-03, rtol=1e-03, qtol=0): """ diff --git a/backends/xnnpack/third-party/XNNPACK b/backends/xnnpack/third-party/XNNPACK index d9cce341f8..fcbf55af6c 160000 --- a/backends/xnnpack/third-party/XNNPACK +++ b/backends/xnnpack/third-party/XNNPACK @@ -1 +1 @@ -Subproject commit d9cce341f86a207da9d851d05e26cd50b508b73c +Subproject commit fcbf55af6cf28a4627bcd1f703ab7ad843f0f3a2 diff --git a/backends/xnnpack/third-party/generate-xnnpack-wrappers.py b/backends/xnnpack/third-party/generate-xnnpack-wrappers.py index d378f39489..bda7952717 100644 --- a/backends/xnnpack/third-party/generate-xnnpack-wrappers.py +++ b/backends/xnnpack/third-party/generate-xnnpack-wrappers.py @@ -36,6 +36,7 @@ "PROD_AVX512F_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", "PROD_AVX512SKX_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", "PROD_AVX512VBMI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", + "PROD_AVX512VNNIGFNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", "PROD_AVX512VNNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", "PROD_RVV_MICROKERNEL_SRCS": "defined(__riscv) || defined(__riscv__)", "PROD_AVXVNNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", @@ -80,6 +81,7 @@ "PROD_AVX512F_MICROKERNEL_SRCS", "PROD_AVX512SKX_MICROKERNEL_SRCS", "PROD_AVX512VBMI_MICROKERNEL_SRCS", + "PROD_AVX512VNNIGFNI_MICROKERNEL_SRCS", "PROD_AVX512VNNI_MICROKERNEL_SRCS", "PROD_RVV_MICROKERNEL_SRCS", "PROD_AVXVNNI_MICROKERNEL_SRCS", diff --git a/backends/xnnpack/third-party/xnnpack.buck.bzl b/backends/xnnpack/third-party/xnnpack.buck.bzl index 4f3dafb5cb..a1add44664 100644 --- a/backends/xnnpack/third-party/xnnpack.buck.bzl +++ b/backends/xnnpack/third-party/xnnpack.buck.bzl @@ -17,6 +17,7 @@ load( "PROD_AVX512F_MICROKERNEL_SRCS", "PROD_AVX512SKX_MICROKERNEL_SRCS", "PROD_AVX512VBMI_MICROKERNEL_SRCS", + "PROD_AVX512VNNIGFNI_MICROKERNEL_SRCS", "PROD_AVX512VNNI_MICROKERNEL_SRCS", "PROD_AVXVNNI_MICROKERNEL_SRCS", "PROD_AVX_MICROKERNEL_SRCS", @@ -1125,9 +1126,45 @@ def define_xnnpack(): ], ) + AVX512VNNIGFNI_COMPILER_FLAGS = AVX512VNNI_COMPILER_FLAGS + [ + "-mgfni", + ] + + # @lint-ignore BUCKLINT: native and fb_native are explicitly forbidden in fbcode. + native.cxx_library( + name = "ukernels_avx512vnnigfni", + srcs = select({ + "DEFAULT": PROD_AVX512VNNIGFNI_MICROKERNEL_SRCS, + "ovr_config//cpu:arm32": DEFAULT_DUMMY_SRC, + "ovr_config//cpu:arm64": DEFAULT_DUMMY_SRC, + }), + headers = subdir_glob([ + ("XNNPACK/src", "**/*.h"), + ("XNNPACK/src", "**/*.c"), + ]), + header_namespace = "", + compiler_flags = [ + "-O2", + "-Wno-error=missing-braces", # required since the SGX toolchain does not have this by default + ] + select({ + "DEFAULT": AVX512VNNIGFNI_COMPILER_FLAGS, + "ovr_config//cpu:arm32": [], + "ovr_config//cpu:arm64": [], + }), + preferred_linkage = "static", + preprocessor_flags = [ + "-DXNN_LOG_LEVEL=0", + ], + exported_deps = [ + ":interface", + ], + ) + AVXVNNI_COMPILER_FLAGS = [ "-mavx2", "-mavxvnni", + "-mf16c", + "-mfma", ] # @lint-ignore BUCKLINT: native and fb_native are explicitly forbidden in fbcode. 
@@ -1180,6 +1217,7 @@ def define_xnnpack(): ":ukernels_ssse3", ":ukernels_xop", ":ukernels_avx512vbmi", + ":ukernels_avx512vnnigfni", ":ukernels_avx512vnni", ":ukernels_avxvnni", ] diff --git a/backends/xnnpack/third-party/xnnpack_src_defs.bzl b/backends/xnnpack/third-party/xnnpack_src_defs.bzl index a9e60e4f17..0a0beba7ef 100644 --- a/backends/xnnpack/third-party/xnnpack_src_defs.bzl +++ b/backends/xnnpack/third-party/xnnpack_src_defs.bzl @@ -2,99 +2,32 @@ Auto-generated by generate-wrappers.py script. Do not modify """ -SUBGRAPH_SRCS = [ - "XNNPACK/src/memory-planner.c", - "XNNPACK/src/runtime.c", - "XNNPACK/src/subgraph.c", - "XNNPACK/src/subgraph/abs.c", - "XNNPACK/src/subgraph/add2.c", - "XNNPACK/src/subgraph/argmax-pooling-2d.c", - "XNNPACK/src/subgraph/average-pooling-2d.c", - "XNNPACK/src/subgraph/bankers-rounding.c", - "XNNPACK/src/subgraph/batch-matrix-multiply.c", - "XNNPACK/src/subgraph/ceiling.c", - "XNNPACK/src/subgraph/clamp.c", - "XNNPACK/src/subgraph/concatenate.c", - "XNNPACK/src/subgraph/convert.c", - "XNNPACK/src/subgraph/convolution-2d.c", - "XNNPACK/src/subgraph/copy.c", - "XNNPACK/src/subgraph/deconvolution-2d.c", - "XNNPACK/src/subgraph/depth-to-space-2d.c", - "XNNPACK/src/subgraph/depthwise-convolution-2d.c", - "XNNPACK/src/subgraph/divide.c", - "XNNPACK/src/subgraph/elu.c", - "XNNPACK/src/subgraph/even-split.c", - "XNNPACK/src/subgraph/floor.c", - "XNNPACK/src/subgraph/fully-connected-sparse.c", - "XNNPACK/src/subgraph/fully-connected.c", - "XNNPACK/src/subgraph/global-average-pooling.c", - "XNNPACK/src/subgraph/global-sum-pooling.c", - "XNNPACK/src/subgraph/hardswish.c", - "XNNPACK/src/subgraph/leaky-relu.c", - "XNNPACK/src/subgraph/max-pooling-2d.c", - "XNNPACK/src/subgraph/maximum2.c", - "XNNPACK/src/subgraph/minimum2.c", - "XNNPACK/src/subgraph/multiply2.c", - "XNNPACK/src/subgraph/negate.c", - "XNNPACK/src/subgraph/prelu.c", - "XNNPACK/src/subgraph/scaled-dot-product-attention.c", - "XNNPACK/src/subgraph/sigmoid.c", - "XNNPACK/src/subgraph/softmax.c", - "XNNPACK/src/subgraph/space-to-depth-2d.c", - "XNNPACK/src/subgraph/square-root.c", - "XNNPACK/src/subgraph/square.c", - "XNNPACK/src/subgraph/squared-difference.c", - "XNNPACK/src/subgraph/static-constant-pad.c", - "XNNPACK/src/subgraph/static-mean.c", - "XNNPACK/src/subgraph/static-reshape.c", - "XNNPACK/src/subgraph/static-resize-bilinear-2d.c", - "XNNPACK/src/subgraph/static-slice.c", - "XNNPACK/src/subgraph/static-transpose.c", - "XNNPACK/src/subgraph/subtract.c", - "XNNPACK/src/subgraph/tanh.c", - "XNNPACK/src/subgraph/unpooling-2d.c", - "XNNPACK/src/subgraph/validation.c", - "XNNPACK/src/tensor.c", -] - -PROD_NEONI8MM_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/neoni8mm.c", -] - -PROD_SSE41_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/sse41.c", -] - -PROD_SSSE3_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/ssse3.c", +PROD_NEONDOTFP16ARITH_AARCH64_MICROKERNEL_SRCS = [ + "XNNPACK/src/amalgam/gen/neondotfp16-aarch64.c", ] -PROD_AVX512VBMI_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/avx512vbmi.c", +PROD_FP16ARITH_MICROKERNEL_SRCS = [ + "XNNPACK/src/amalgam/gen/fp16arith.c", ] -LOGGING_SRCS = [ - "XNNPACK/src/enums/datatype-strings.c", - "XNNPACK/src/enums/microkernel-type.c", - "XNNPACK/src/enums/node-type.c", - "XNNPACK/src/enums/operator-type.c", - "XNNPACK/src/log.c", +PROD_SSE_MICROKERNEL_SRCS = [ + "XNNPACK/src/amalgam/gen/sse.c", ] -PROD_NEONDOTFP16ARITH_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/neondotfp16arith.c", +PROD_FMA3_MICROKERNEL_SRCS = [ + "XNNPACK/src/amalgam/gen/fma3.c", 
] -PROD_F16C_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/f16c.c", +PROD_SSE2_MICROKERNEL_SRCS = [ + "XNNPACK/src/amalgam/gen/sse2.c", ] -PROD_NEON_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/neon.c", +PROD_NEONV8_MICROKERNEL_SRCS = [ + "XNNPACK/src/amalgam/gen/neonv8.c", ] -PROD_RVV_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/rvv.c", +PROD_AVX512SKX_MICROKERNEL_SRCS = [ + "XNNPACK/src/amalgam/gen/avx512skx.c", ] AARCH32_ASM_MICROKERNEL_SRCS = [ @@ -124,28 +57,9 @@ AARCH32_ASM_MICROKERNEL_SRCS = [ "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a75.S", "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-ld64.S", "XNNPACK/src/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-4x8c4-minmax-asm-aarch32-neondotfp16arith-cortex-a55.S", + "XNNPACK/src/qd8-f16-qc8w-igemm/gen/qd8-f16-qc8w-igemm-4x8c4-minmax-asm-aarch32-neondotfp16arith-cortex-a55.S", "XNNPACK/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8c4-minmax-asm-aarch32-neondot-cortex-a55.S", "XNNPACK/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x8c4-minmax-asm-aarch32-neondot-cortex-a55.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53-prfm.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64-prfm.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8c4-minmax-rndnu-asm-aarch32-neondot-cortex-a55.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8c4-minmax-rndnu-asm-aarch32-neondot-ld64.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53-prfm.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64-prfm.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8c4-minmax-rndnu-asm-aarch32-neondot-cortex-a55.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8c4-minmax-rndnu-asm-aarch32-neondot-ld64.S", "XNNPACK/src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-3p8c-minmax-fp32-asm-aarch32-neonv8-mla8-cortex-a35.S", "XNNPACK/src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-3p16c-minmax-fp32-asm-aarch32-neonv8-mla8-cortex-a35.S", "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S", @@ -206,44 +120,17 @@ AARCH32_ASM_MICROKERNEL_SRCS = [ "XNNPACK/src/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-neon-x2.S", ] -PROD_AVX_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/avx.c", -] - 
-PROD_XOP_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/xop.c", -] - -PROD_NEONFP16ARITH_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/neonfp16arith.c", -] - -PROD_NEONDOT_AARCH64_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/neondot-aarch64.c", -] - -PROD_AVX512SKX_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/avx512skx.c", -] - -PROD_ARMSIMD32_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/armsimd32.c", -] - -PROD_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/neonfp16arith-aarch64.c", -] - -PROD_NEONDOTFP16ARITH_AARCH64_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/neondotfp16-aarch64.c", +PROD_FMA_MICROKERNEL_SRCS = [ + "XNNPACK/src/amalgam/gen/fma.c", ] -PROD_AVX512VNNI_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/avx512vnni.c", +PROD_NEON_AARCH64_MICROKERNEL_SRCS = [ + "XNNPACK/src/amalgam/gen/neon-aarch64.c", + "XNNPACK/src/amalgam/gen/neonfma-aarch64.c", ] -PROD_SSE2_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/sse2.c", +PROD_SSSE3_MICROKERNEL_SRCS = [ + "XNNPACK/src/amalgam/gen/ssse3.c", ] TABLE_SRCS = [ @@ -258,31 +145,20 @@ TABLE_SRCS = [ "XNNPACK/src/tables/vlog.c", ] -PROD_FP16ARITH_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/fp16arith.c", -] - -PROD_NEONFMA_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/neonfma.c", -] - -JIT_SRCS = [ - "XNNPACK/src/jit/aarch32-assembler.cc", - "XNNPACK/src/jit/aarch64-assembler.cc", - "XNNPACK/src/jit/assembler.cc", +PROD_AVX512VBMI_MICROKERNEL_SRCS = [ + "XNNPACK/src/amalgam/gen/avx512vbmi.c", ] PROD_AVX2_MICROKERNEL_SRCS = [ "XNNPACK/src/amalgam/gen/avx2.c", ] -PROD_AVXVNNI_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/avxvnni.c", +PROD_NEONDOTFP16ARITH_MICROKERNEL_SRCS = [ + "XNNPACK/src/amalgam/gen/neondotfp16arith.c", ] -PROD_NEON_AARCH64_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/neon-aarch64.c", - "XNNPACK/src/amalgam/gen/neonfma-aarch64.c", +PROD_AVXVNNI_MICROKERNEL_SRCS = [ + "XNNPACK/src/amalgam/gen/avxvnni.c", ] OPERATOR_SRCS = [ @@ -315,20 +191,80 @@ OPERATOR_SRCS = [ "XNNPACK/src/operators/unpooling-nhwc.c", ] -PROD_NEONDOT_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/neondot.c", +PROD_NEONFP16ARITH_MICROKERNEL_SRCS = [ + "XNNPACK/src/amalgam/gen/neonfp16arith.c", ] -PROD_SCALAR_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/scalar.c", +PROD_F16C_MICROKERNEL_SRCS = [ + "XNNPACK/src/amalgam/gen/f16c.c", +] + +PROD_XOP_MICROKERNEL_SRCS = [ + "XNNPACK/src/amalgam/gen/xop.c", ] PROD_AVX512F_MICROKERNEL_SRCS = [ "XNNPACK/src/amalgam/gen/avx512f.c", ] -PROD_FMA3_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/fma3.c", +SUBGRAPH_SRCS = [ + "XNNPACK/src/memory-planner.c", + "XNNPACK/src/runtime.c", + "XNNPACK/src/subgraph.c", + "XNNPACK/src/subgraph/abs.c", + "XNNPACK/src/subgraph/add2.c", + "XNNPACK/src/subgraph/argmax-pooling-2d.c", + "XNNPACK/src/subgraph/average-pooling-2d.c", + "XNNPACK/src/subgraph/bankers-rounding.c", + "XNNPACK/src/subgraph/batch-matrix-multiply.c", + "XNNPACK/src/subgraph/ceiling.c", + "XNNPACK/src/subgraph/clamp.c", + "XNNPACK/src/subgraph/concatenate.c", + "XNNPACK/src/subgraph/convert.c", + "XNNPACK/src/subgraph/convolution-2d.c", + "XNNPACK/src/subgraph/copy.c", + "XNNPACK/src/subgraph/deconvolution-2d.c", + "XNNPACK/src/subgraph/depth-to-space-2d.c", + "XNNPACK/src/subgraph/depthwise-convolution-2d.c", + "XNNPACK/src/subgraph/divide.c", + "XNNPACK/src/subgraph/elu.c", + "XNNPACK/src/subgraph/even-split.c", + "XNNPACK/src/subgraph/floor.c", + "XNNPACK/src/subgraph/fully-connected-sparse.c", + "XNNPACK/src/subgraph/fully-connected.c", + 
"XNNPACK/src/subgraph/global-average-pooling.c", + "XNNPACK/src/subgraph/global-sum-pooling.c", + "XNNPACK/src/subgraph/hardswish.c", + "XNNPACK/src/subgraph/leaky-relu.c", + "XNNPACK/src/subgraph/max-pooling-2d.c", + "XNNPACK/src/subgraph/maximum2.c", + "XNNPACK/src/subgraph/minimum2.c", + "XNNPACK/src/subgraph/multiply2.c", + "XNNPACK/src/subgraph/negate.c", + "XNNPACK/src/subgraph/prelu.c", + "XNNPACK/src/subgraph/reshape-helpers.c", + "XNNPACK/src/subgraph/scaled-dot-product-attention.c", + "XNNPACK/src/subgraph/sigmoid.c", + "XNNPACK/src/subgraph/softmax.c", + "XNNPACK/src/subgraph/space-to-depth-2d.c", + "XNNPACK/src/subgraph/square-root.c", + "XNNPACK/src/subgraph/square.c", + "XNNPACK/src/subgraph/squared-difference.c", + "XNNPACK/src/subgraph/static-constant-pad.c", + "XNNPACK/src/subgraph/static-mean.c", + "XNNPACK/src/subgraph/static-reshape.c", + "XNNPACK/src/subgraph/static-resize-bilinear-2d.c", + "XNNPACK/src/subgraph/static-slice.c", + "XNNPACK/src/subgraph/static-transpose.c", + "XNNPACK/src/subgraph/subtract.c", + "XNNPACK/src/subgraph/tanh.c", + "XNNPACK/src/subgraph/unpooling-2d.c", + "XNNPACK/src/subgraph/validation.c", + "XNNPACK/src/tensor.c", +] + +PROD_RVV_MICROKERNEL_SRCS = [ + "XNNPACK/src/amalgam/gen/rvv.c", ] AARCH64_ASM_MICROKERNEL_SRCS = [ @@ -514,87 +450,13 @@ AARCH64_ASM_MICROKERNEL_SRCS = [ "XNNPACK/src/f32-qc8w-gemm/gen/f32-qc8w-gemm-6x8-minmax-asm-aarch64-neonfma-ld128.S", "XNNPACK/src/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-4x16c4-minmax-asm-aarch64-neondot-ld128.S", "XNNPACK/src/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-4x16c4-minmax-asm-aarch64-neondotfp16arith-cortex-a55.S", + "XNNPACK/src/qd8-f16-qc8w-igemm/gen/qd8-f16-qc8w-igemm-4x16c4-minmax-asm-aarch64-neondot-cortex-a55.S", + "XNNPACK/src/qd8-f16-qc8w-igemm/gen/qd8-f16-qc8w-igemm-4x16c4-minmax-asm-aarch64-neondot-ld128.S", "XNNPACK/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16c4-minmax-asm-aarch64-neondot-cortex-a55.S", "XNNPACK/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16c4-minmax-asm-aarch64-neondot-ld64.S", "XNNPACK/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16c4-minmax-asm-aarch64-neondot-ld128.S", "XNNPACK/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-asm-aarch64-neondot-cortex-a55.S", "XNNPACK/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-asm-aarch64-neondot-ld128.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53-prfm.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53-prfm.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-prfm.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-fp32-asm-aarch64-neondot-ld32.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-rndnu-asm-aarch64-neondot-ld32.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-rndnu-asm-aarch64-neondot-ld64.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53-prfm.S", - 
"XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mull.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53-prfm.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal-prfm.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mull.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c16-minmax-fp32-asm-aarch64-neon-mlal.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c16-minmax-rndnu-asm-aarch64-neon-mlal.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-cortex-a53.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-ld64-prfm.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-ld64.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld32.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld32.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld64.S", - "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53-prfm.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53-prfm.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-prfm.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53-prfm.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S", - 
"XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53-prfm.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal-prfm.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c16-minmax-fp32-asm-aarch64-neon-mlal.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c16-minmax-rndnu-asm-aarch64-neon-mlal.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-cortex-a53.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-ld64-prfm.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-ld64.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld64.S", - "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53-prfm.S", "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S", "XNNPACK/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S", @@ -691,18 +553,68 @@ XNNPACK_SRCS = [ "XNNPACK/src/params.c", ] -PROD_FMA_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/fma.c", +JIT_SRCS = [ + "XNNPACK/src/jit/aarch32-assembler.cc", + "XNNPACK/src/jit/aarch64-assembler.cc", + "XNNPACK/src/jit/assembler.cc", ] -PROD_NEONV8_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/neonv8.c", +PROD_NEONFMA_MICROKERNEL_SRCS = [ + "XNNPACK/src/amalgam/gen/neonfma.c", +] + +PROD_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS = [ + "XNNPACK/src/amalgam/gen/neonfp16arith-aarch64.c", ] PROD_NEONFP16_MICROKERNEL_SRCS = [ "XNNPACK/src/amalgam/gen/neonfp16.c", ] -PROD_SSE_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/sse.c", +PROD_AVX512VNNIGFNI_MICROKERNEL_SRCS = [ + "XNNPACK/src/amalgam/gen/avx512vnnigfni.c", +] + +PROD_NEONDOT_AARCH64_MICROKERNEL_SRCS = [ + "XNNPACK/src/amalgam/gen/neondot-aarch64.c", +] + +PROD_ARMSIMD32_MICROKERNEL_SRCS = [ + "XNNPACK/src/amalgam/gen/armsimd32.c", +] + +PROD_NEONDOT_MICROKERNEL_SRCS = [ + "XNNPACK/src/amalgam/gen/neondot.c", +] + 
+PROD_SCALAR_MICROKERNEL_SRCS = [ + "XNNPACK/src/amalgam/gen/scalar.c", +] + +PROD_SSE41_MICROKERNEL_SRCS = [ + "XNNPACK/src/amalgam/gen/sse41.c", +] + +PROD_NEONI8MM_MICROKERNEL_SRCS = [ + "XNNPACK/src/amalgam/gen/neoni8mm.c", +] + +LOGGING_SRCS = [ + "XNNPACK/src/enums/datatype-strings.c", + "XNNPACK/src/enums/microkernel-type.c", + "XNNPACK/src/enums/node-type.c", + "XNNPACK/src/enums/operator-type.c", + "XNNPACK/src/log.c", +] + +PROD_NEON_MICROKERNEL_SRCS = [ + "XNNPACK/src/amalgam/gen/neon.c", +] + +PROD_AVX512VNNI_MICROKERNEL_SRCS = [ + "XNNPACK/src/amalgam/gen/avx512vnni.c", +] + +PROD_AVX_MICROKERNEL_SRCS = [ + "XNNPACK/src/amalgam/gen/avx.c", ] diff --git a/backends/xnnpack/third-party/xnnpack_wrapper_defs.bzl b/backends/xnnpack/third-party/xnnpack_wrapper_defs.bzl index 256633ff55..2dbb41ff01 100644 --- a/backends/xnnpack/third-party/xnnpack_wrapper_defs.bzl +++ b/backends/xnnpack/third-party/xnnpack_wrapper_defs.bzl @@ -115,6 +115,10 @@ PROD_AVX512VBMI_MICROKERNEL_SRCS = [ "xnnpack_wrappers/amalgam/gen/avx512vbmi.c", ] +PROD_AVX512VNNIGFNI_MICROKERNEL_SRCS = [ + "xnnpack_wrappers/amalgam/gen/avx512vnnigfni.c", +] + PROD_AVX512VNNI_MICROKERNEL_SRCS = [ "xnnpack_wrappers/amalgam/gen/avx512vnni.c", ] @@ -154,28 +158,9 @@ AARCH32_ASM_MICROKERNEL_SRCS = [ "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a75.S", "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-ld64.S", "xnnpack_wrappers/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-4x8c4-minmax-asm-aarch32-neondotfp16arith-cortex-a55.S", + "xnnpack_wrappers/qd8-f16-qc8w-igemm/gen/qd8-f16-qc8w-igemm-4x8c4-minmax-asm-aarch32-neondotfp16arith-cortex-a55.S", "xnnpack_wrappers/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8c4-minmax-asm-aarch32-neondot-cortex-a55.S", "xnnpack_wrappers/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x8c4-minmax-asm-aarch32-neondot-cortex-a55.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53-prfm.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64-prfm.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8c4-minmax-rndnu-asm-aarch32-neondot-cortex-a55.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8c4-minmax-rndnu-asm-aarch32-neondot-ld64.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53-prfm.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53.S", - 
"xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64-prfm.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8c4-minmax-rndnu-asm-aarch32-neondot-cortex-a55.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8c4-minmax-rndnu-asm-aarch32-neondot-ld64.S", "xnnpack_wrappers/qs8-qc8w-dwconv/qs8-qc8w-dwconv-3p8c-minmax-fp32-asm-aarch32-neonv8-mla8-cortex-a35.S", "xnnpack_wrappers/qs8-qc8w-dwconv/qs8-qc8w-dwconv-3p16c-minmax-fp32-asm-aarch32-neonv8-mla8-cortex-a35.S", "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a7-prfm.S", @@ -419,87 +404,13 @@ AARCH64_ASM_MICROKERNEL_SRCS = [ "xnnpack_wrappers/f32-qc8w-gemm/gen/f32-qc8w-gemm-6x8-minmax-asm-aarch64-neonfma-ld128.S", "xnnpack_wrappers/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-4x16c4-minmax-asm-aarch64-neondot-ld128.S", "xnnpack_wrappers/qd8-f16-qc8w-gemm/gen/qd8-f16-qc8w-gemm-4x16c4-minmax-asm-aarch64-neondotfp16arith-cortex-a55.S", + "xnnpack_wrappers/qd8-f16-qc8w-igemm/gen/qd8-f16-qc8w-igemm-4x16c4-minmax-asm-aarch64-neondot-cortex-a55.S", + "xnnpack_wrappers/qd8-f16-qc8w-igemm/gen/qd8-f16-qc8w-igemm-4x16c4-minmax-asm-aarch64-neondot-ld128.S", "xnnpack_wrappers/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16c4-minmax-asm-aarch64-neondot-cortex-a55.S", "xnnpack_wrappers/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16c4-minmax-asm-aarch64-neondot-ld64.S", "xnnpack_wrappers/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16c4-minmax-asm-aarch64-neondot-ld128.S", "xnnpack_wrappers/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-asm-aarch64-neondot-cortex-a55.S", "xnnpack_wrappers/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-asm-aarch64-neondot-ld128.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53-prfm.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53-prfm.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-prfm.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-fp32-asm-aarch64-neondot-ld32.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-rndnu-asm-aarch64-neondot-ld32.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-rndnu-asm-aarch64-neondot-ld64.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53-prfm.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mull.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53-prfm.S", - 
"xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal-prfm.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mull.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c16-minmax-fp32-asm-aarch64-neon-mlal.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c16-minmax-rndnu-asm-aarch64-neon-mlal.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-cortex-a53.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-ld64-prfm.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-ld64.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld32.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld32.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld64.S", - "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53-prfm.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53-prfm.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-prfm.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53-prfm.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53-prfm.S", - 
"xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal-prfm.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c16-minmax-fp32-asm-aarch64-neon-mlal.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c16-minmax-rndnu-asm-aarch64-neon-mlal.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-cortex-a53.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-ld64-prfm.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-ld64.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld64.S", - "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53-prfm.S", "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S", "xnnpack_wrappers/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S", diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/avx512vnnigfni.c b/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/avx512vnnigfni.c new file mode 100644 index 0000000000..c02b5a09ae --- /dev/null +++ b/backends/xnnpack/third-party/xnnpack_wrappers/amalgam/gen/avx512vnnigfni.c @@ -0,0 +1,5 @@ +/* Auto-generated by generate-wrappers.py script. Do not modify */ + +#if defined(__i386__) || defined(__i686__) || defined(__x86_64__) +#include +#endif /* defined(__i386__) || defined(__i686__) || defined(__x86_64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f16-qc8w-igemm/gen/qd8-f16-qc8w-igemm-4x16c4-minmax-asm-aarch64-neondot-cortex-a55.S b/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f16-qc8w-igemm/gen/qd8-f16-qc8w-igemm-4x16c4-minmax-asm-aarch64-neondot-cortex-a55.S new file mode 100644 index 0000000000..54c754b5d8 --- /dev/null +++ b/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f16-qc8w-igemm/gen/qd8-f16-qc8w-igemm-4x16c4-minmax-asm-aarch64-neondot-cortex-a55.S @@ -0,0 +1,5 @@ +/* Auto-generated by generate-wrappers.py script. 
Do not modify */ + +#if defined(__aarch64__) +#include +#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f16-qc8w-igemm/gen/qd8-f16-qc8w-igemm-4x16c4-minmax-asm-aarch64-neondot-ld128.S b/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f16-qc8w-igemm/gen/qd8-f16-qc8w-igemm-4x16c4-minmax-asm-aarch64-neondot-ld128.S new file mode 100644 index 0000000000..41bee5072d --- /dev/null +++ b/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f16-qc8w-igemm/gen/qd8-f16-qc8w-igemm-4x16c4-minmax-asm-aarch64-neondot-ld128.S @@ -0,0 +1,5 @@ +/* Auto-generated by generate-wrappers.py script. Do not modify */ + +#if defined(__aarch64__) +#include +#endif /* defined(__aarch64__) */ diff --git a/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f16-qc8w-igemm/gen/qd8-f16-qc8w-igemm-4x8c4-minmax-asm-aarch32-neondotfp16arith-cortex-a55.S b/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f16-qc8w-igemm/gen/qd8-f16-qc8w-igemm-4x8c4-minmax-asm-aarch32-neondotfp16arith-cortex-a55.S new file mode 100644 index 0000000000..db2eda8704 --- /dev/null +++ b/backends/xnnpack/third-party/xnnpack_wrappers/qd8-f16-qc8w-igemm/gen/qd8-f16-qc8w-igemm-4x8c4-minmax-asm-aarch32-neondotfp16arith-cortex-a55.S @@ -0,0 +1,5 @@ +/* Auto-generated by generate-wrappers.py script. Do not modify */ + +#if defined(__arm__) +#include +#endif /* defined(__arm__) */ diff --git a/build/build_apple_frameworks.sh b/build/build_apple_frameworks.sh index 5ac8cd2c1d..61e7c2725f 100755 --- a/build/build_apple_frameworks.sh +++ b/build/build_apple_frameworks.sh @@ -11,20 +11,21 @@ PLATFORMS=("iphoneos" "iphonesimulator") PLATFORM_FLAGS=("OS" "SIMULATOR") SOURCE_ROOT_DIR="" OUTPUT="cmake-out" -MODE="Debug" +MODE="Release" TOOLCHAIN="" -BUCK2="/tmp/buck2" +BUCK2="buck2" PYTHON=$(which python3) -FLATC="" +FLATC="flatc" IOS_DEPLOYMENT_TARGET="17.0" COREML=OFF MPS=OFF +PORTABLE=OFF XNNPACK=OFF HEADERS_PATH="include" -EXECUTORCH_FRAMEWORK="executorch:libexecutorch.a,libextension_data_loader.a,libextension_module.a:$HEADERS_PATH" -PORTABLE_FRAMEWORK="portable_backend:libportable_kernels.a,libportable_ops_lib.a:" +EXECUTORCH_FRAMEWORK="executorch:libexecutorch.a,libextension_apple.a,libextension_data_loader.a,libextension_module.a:$HEADERS_PATH" COREML_FRAMEWORK="coreml_backend:libcoremldelegate.a:" MPS_FRAMEWORK="mps_backend:libmpsdelegate.a:" +PORTABLE_FRAMEWORK="portable_backend:libportable_kernels.a,libportable_ops_lib.a:" XNNPACK_FRAMEWORK="xnnpack_backend:libXNNPACK.a,libcpuinfo.a,libpthreadpool.a,libxnnpack_backend.a:" usage() { @@ -34,14 +35,15 @@ usage() { echo echo "Options:" echo " --output=DIR Output directory. Default: 'cmake-out'" - echo " --Release Use Release build mode. Default: 'Debug'" + echo " --Debug Use Debug build mode. Default: 'Release'" echo " --toolchain=FILE Cmake toolchain file. Default: '\$SOURCE_ROOT_DIR/third-party/pytorch/cmake/iOS.cmake'" echo " --buck2=FILE Buck2 executable path. Default: '/tmp/buck2'" echo " --python=FILE Python executable path. Default: Path of python3 found in the current \$PATH" echo " --flatc=FILE FlatBuffers Compiler executable path. Default: '\$SOURCE_ROOT_DIR/third-party/flatbuffers/cmake-out/flatc'" - echo " --coreml Include this flag to build Core ML backend." - echo " --mps Include this flag to build Metal Performance Shaders backend." - echo " --xnnpack Include this flag to build XNNPACK backend." + echo " --coreml Include this flag to build the Core ML backend." + echo " --mps Include this flag to build the Metal Performance Shaders backend." 
+ echo " --portable Include this flag to build the Portable backend." + echo " --xnnpack Include this flag to build the XNNPACK backend." echo echo "Example:" echo " $0 /path/to/source/root --output=cmake-out --Release --toolchain=/path/to/cmake/toolchain --buck2=/path/to/buck2 --python=/path/to/python3 --coreml --mps --xnnpack" @@ -52,13 +54,14 @@ for arg in "$@"; do case $arg in -h|--help) usage ;; --output=*) OUTPUT="${arg#*=}" ;; - --Release) MODE="Release" ;; + --Debug) MODE="Debug" ;; --toolchain=*) TOOLCHAIN="${arg#*=}" ;; --buck2=*) BUCK2="${arg#*=}" ;; --python=*) PYTHON="${arg#*=}" ;; --flatc=*) FLATC="${arg#*=}" ;; --ios-deployment-target=*) IOS_DEPLOYMENT_TARGET="${arg#*=}" ;; --coreml) COREML=ON ;; + --portable) PORTABLE=ON ;; --mps) MPS=ON ;; --xnnpack) XNNPACK=ON ;; *) @@ -105,12 +108,14 @@ cmake_build() { echo "Building for $platform with flag $platform_flag" mkdir "$platform" && cd "$platform" || exit 1 cmake "$SOURCE_ROOT_DIR" -G Xcode \ + -DCMAKE_BUILD_TYPE="$MODE" \ -DCMAKE_TOOLCHAIN_FILE="$TOOLCHAIN" \ -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD="c++17" \ -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY="libc++" \ -DBUCK2="$BUCK2" \ -DPYTHON_EXECUTABLE="$PYTHON" \ -DFLATC_EXECUTABLE="$FLATC" \ + -DEXECUTORCH_BUILD_EXTENSION_APPLE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DCMAKE_ARCHIVE_OUTPUT_DIRECTORY="$(pwd)" \ @@ -135,6 +140,8 @@ mkdir -p "$HEADERS_PATH" //extension/module: \ | rsync -av --files-from=- "$SOURCE_ROOT_DIR" "$HEADERS_PATH/executorch" +cp "$SOURCE_ROOT_DIR/extension/apple/ExecuTorch/Exported/"{*.h,*.modulemap} "$HEADERS_PATH" + echo "Creating frameworks" for platform in "${PLATFORMS[@]}"; do @@ -152,7 +159,7 @@ append_framework_flag() { } append_framework_flag "ON" "$EXECUTORCH_FRAMEWORK" -append_framework_flag "ON" "$PORTABLE_FRAMEWORK" +append_framework_flag "$PORTABLE" "$PORTABLE_FRAMEWORK" append_framework_flag "$COREML" "$COREML_FRAMEWORK" append_framework_flag "$MPS" "$MPS_FRAMEWORK" append_framework_flag "$XNNPACK" "$XNNPACK_FRAMEWORK" diff --git a/build/cmake_deps.toml b/build/cmake_deps.toml index bf1899534a..962eedf1dc 100644 --- a/build/cmake_deps.toml +++ b/build/cmake_deps.toml @@ -31,7 +31,7 @@ filters = [ buck_targets = [ # //kernels/portable:operators would be more appropriate, but buck2 doesn't # think it has any "inputs" since its srcs list is empty. - "//kernels/portable:generated_lib_all_ops", + "//kernels/portable:generated_lib", ] filters = [ ".cpp$", @@ -246,9 +246,10 @@ excludes = [ ] deps = [ "executorch", + "extension_data_loader", "extension_module", "portable_kernels", "quantized_kernels", "xnnpack_backend", ] -# ---------------------------------- LLama start ---------------------------------- +# ---------------------------------- LLama end ---------------------------------- diff --git a/build/packaging/post_build_script.sh b/build/packaging/post_build_script.sh new file mode 100644 index 0000000000..fd71b18565 --- /dev/null +++ b/build/packaging/post_build_script.sh @@ -0,0 +1,10 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +set -eux + +echo "This script is run after building ExecuTorch binaries" diff --git a/build/packaging/pre_build_script.sh b/build/packaging/pre_build_script.sh new file mode 100644 index 0000000000..3940168c40 --- /dev/null +++ b/build/packaging/pre_build_script.sh @@ -0,0 +1,10 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -eux + +echo "This script is run before building ExecuTorch binaries" diff --git a/build/packaging/smoke_test.py b/build/packaging/smoke_test.py new file mode 100644 index 0000000000..5273a457f1 --- /dev/null +++ b/build/packaging/smoke_test.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +def main(): + """ + Run ExecuTorch binary smoke tests. This is a placeholder for future tests. See + https://github.com/pytorch/test-infra/wiki/Using-Nova-Reusable-Build-Workflows + for more information about Nova binary workflow. + """ + + +if __name__ == "__main__": + main() diff --git a/build/test_ios.sh b/build/test_ios.sh index 8c65eb6080..7b0b31262e 100755 --- a/build/test_ios.sh +++ b/build/test_ios.sh @@ -70,7 +70,7 @@ curl -LO "https://github.com/facebook/buck2/releases/download/$BUCK2_RELEASE_DAT zstd -cdq "$BUCK2_ARCHIVE" > "$BUCK2" && chmod +x "$BUCK2" rm "$BUCK2_ARCHIVE" -./install_requirements.sh +./install_requirements.sh --pybind coreml mps xnnpack export PATH="$(realpath third-party/flatbuffers/cmake-out):$PATH" ./build/install_flatc.sh @@ -82,18 +82,10 @@ say "Installing MPS Backend Requirements" ./backends/apple/mps/install_requirements.sh -say "Installing Python Bindings" - -EXECUTORCH_BUILD_PYBIND=ON \ -BUCK="$(pwd)/$BUCK2" \ -CMAKE_ARGS="-DEXECUTORCH_BUILD_COREML=ON -DEXECUTORCH_BUILD_MPS=ON -DEXECUTORCH_BUILD_XNNPACK=ON" \ -CMAKE_BUILD_PARALLEL_LEVEL=9 \ -pip install . --no-build-isolation -v - say "Exporting Models" python3 -m examples.portable.scripts.export --model_name="$MODEL_NAME" -python3 -m examples.apple.coreml.scripts.export_and_delegate --model_name="$MODEL_NAME" +python3 -m examples.apple.coreml.scripts.export --model_name="$MODEL_NAME" python3 -m examples.apple.mps.scripts.mps_example --model_name="$MODEL_NAME" python3 -m examples.xnnpack.aot_compiler --model_name="$MODEL_NAME" --delegate @@ -107,7 +99,7 @@ curl https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt \ say "Building Frameworks" -./build/build_apple_frameworks.sh --buck2="$(realpath $BUCK2)" --Release --coreml --mps --xnnpack +./build/build_apple_frameworks.sh --buck2="$(realpath $BUCK2)" --coreml --mps --portable --xnnpack mv cmake-out "$APP_PATH/Frameworks" say "Creating Simulator" diff --git a/build/test_ios_ci.sh b/build/test_ios_ci.sh index 41aeb127eb..8ca8cb4caa 100755 --- a/build/test_ios_ci.sh +++ b/build/test_ios_ci.sh @@ -39,18 +39,10 @@ say "Installing MPS Backend Requirements" ./backends/apple/mps/install_requirements.sh -say "Installing Python Bindings" - -EXECUTORCH_BUILD_PYBIND=ON \ -BUCK="$(which buck2)" \ -CMAKE_ARGS="-DEXECUTORCH_BUILD_COREML=ON -DEXECUTORCH_BUILD_MPS=ON -DEXECUTORCH_BUILD_XNNPACK=ON" \ -CMAKE_BUILD_PARALLEL_LEVEL=9 \ -pip install . 
--no-build-isolation -v - say "Exporting Models" python3 -m examples.portable.scripts.export --model_name="$MODEL_NAME" --segment_alignment=0x4000 -python3 -m examples.apple.coreml.scripts.export_and_delegate --model_name="$MODEL_NAME" +python3 -m examples.apple.coreml.scripts.export --model_name="$MODEL_NAME" python3 -m examples.apple.mps.scripts.mps_example --model_name="$MODEL_NAME" python3 -m examples.xnnpack.aot_compiler --model_name="$MODEL_NAME" --delegate @@ -64,7 +56,7 @@ curl https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt \ say "Building Frameworks" -./build/build_apple_frameworks.sh --buck2="$(which buck2)" --flatc="$(which flatc)" --coreml --mps --xnnpack +./build/build_apple_frameworks.sh --buck2="$(which buck2)" --flatc="$(which flatc)" --coreml --mps --portable --xnnpack mv cmake-out "$APP_PATH/Frameworks" say "Creating Simulator" diff --git a/docs/source/_templates/layout.html b/docs/source/_templates/layout.html index 86d47dd561..7d259b2e14 100644 --- a/docs/source/_templates/layout.html +++ b/docs/source/_templates/layout.html @@ -62,7 +62,7 @@ {% block footer %} {{ super() }} {{ super() }} diff --git a/docs/source/build-run-coreml.md b/docs/source/build-run-coreml.md index 66983a8f0f..c442b2cc6b 100644 --- a/docs/source/build-run-coreml.md +++ b/docs/source/build-run-coreml.md @@ -59,7 +59,7 @@ xcode-select --install cd executorch # Generates ./mv3_coreml_all.pte file. -python3 -m examples.apple.coreml.scripts.export_and_delegate --model_name mv3 +python3 -m examples.apple.coreml.scripts.export --model_name mv3 ``` - Core ML backend uses [coremltools](https://apple.github.io/coremltools/docs-guides/source/overview-coremltools.html) to lower [Edge dialect](ir-exir.md#edge-dialect) to Core ML format and then bundles it in the `.pte` file. diff --git a/docs/source/index.rst b/docs/source/index.rst index e685f4cebe..871f4aba87 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -79,6 +79,17 @@ Topics in this section will help you get started with ExecuTorch. getting-started-setup runtime-build-and-cross-compilation +.. toctree:: + :glob: + :maxdepth: 2 + :caption: Working with LLMs + :hidden: + + llm/introduction + llm/mobile/index + llm/desktop/index + llm/advanced-flow/index + .. toctree:: :glob: :maxdepth: 1 diff --git a/docs/source/llm/advanced-flow/advanced-flow.md b/docs/source/llm/advanced-flow/advanced-flow.md new file mode 100644 index 0000000000..f8b596340f --- /dev/null +++ b/docs/source/llm/advanced-flow/advanced-flow.md @@ -0,0 +1,7 @@ +# Advanced Flows + +## Custom quantization + +## Bring GGUF to PyTorch ecosystem + +## TorchTune interoperability diff --git a/docs/source/llm/advanced-flow/index.rst b/docs/source/llm/advanced-flow/index.rst new file mode 100644 index 0000000000..f1ab697bb8 --- /dev/null +++ b/docs/source/llm/advanced-flow/index.rst @@ -0,0 +1,9 @@ +Enabling LLMs on Advanced Flow +======================= + +This section will walk you through + +.. 
toctree:: + :maxdepth: 1 + + advanced-flow diff --git a/docs/source/llm/desktop/benchmarks.md b/docs/source/llm/desktop/benchmarks.md new file mode 100644 index 0000000000..9a827aca07 --- /dev/null +++ b/docs/source/llm/desktop/benchmarks.md @@ -0,0 +1,7 @@ +# Enabling LLMs on Desktop + +## Local Llama on Desktop Benchmarks + +**Results** + +**Instructions** diff --git a/docs/source/llm/desktop/index.rst b/docs/source/llm/desktop/index.rst new file mode 100644 index 0000000000..71e77a9f11 --- /dev/null +++ b/docs/source/llm/desktop/index.rst @@ -0,0 +1,9 @@ +Enabling LLMs on Desktop +======================= + +This section will walk you through + +.. toctree:: + :maxdepth: 1 + + benchmarks diff --git a/docs/source/llm/introduction.md b/docs/source/llm/introduction.md new file mode 100644 index 0000000000..a97310eb4a --- /dev/null +++ b/docs/source/llm/introduction.md @@ -0,0 +1,7 @@ +# Introduction + +## Current landscape of local LLMs + +## What is our offering? + +## Why and when should you use it? diff --git a/docs/source/llm/mobile/benchmarks.md b/docs/source/llm/mobile/benchmarks.md new file mode 100644 index 0000000000..e056d59d95 --- /dev/null +++ b/docs/source/llm/mobile/benchmarks.md @@ -0,0 +1,6 @@ +# Mobile Benchmarks for Local Llama + +## Results + + +## Instructions diff --git a/docs/source/llm/mobile/customization-examples.md b/docs/source/llm/mobile/customization-examples.md new file mode 100644 index 0000000000..1364304f8d --- /dev/null +++ b/docs/source/llm/mobile/customization-examples.md @@ -0,0 +1,9 @@ +# Customization examples + +## Custom tokenization + +## Custom sampler + +## Speculative decoding + +## Modify a mobile app to use a different LLM model diff --git a/docs/source/llm/mobile/getting-started.md b/docs/source/llm/mobile/getting-started.md new file mode 100644 index 0000000000..c3d0b11d6d --- /dev/null +++ b/docs/source/llm/mobile/getting-started.md @@ -0,0 +1,16 @@ +# Getting Started with LLMs via ExecuTorch + + +## Simple “Hello World” example + + +## Use Mobile Acceleration + + +## Quantization via XNNPACKQuantizer + + +## Debugging and Profiling + + +## Build Mobile LLM chat App Examples diff --git a/docs/source/llm/mobile/index.rst b/docs/source/llm/mobile/index.rst new file mode 100644 index 0000000000..8012c01f8c --- /dev/null +++ b/docs/source/llm/mobile/index.rst @@ -0,0 +1,12 @@ +Enabling LLMs on Mobile +======================= + +This section will walk you through + +.. 
toctree:: + :maxdepth: 1 + + benchmarks + getting-started + customization-examples + validating-other-models diff --git a/docs/source/llm/mobile/validating-other-models.md b/docs/source/llm/mobile/validating-other-models.md new file mode 100644 index 0000000000..6097241777 --- /dev/null +++ b/docs/source/llm/mobile/validating-other-models.md @@ -0,0 +1,3 @@ +# Validating other models + +## Exportability results diff --git a/docs/source/runtime-build-and-cross-compilation.md b/docs/source/runtime-build-and-cross-compilation.md index 31d9b38b71..22246b8f8c 100644 --- a/docs/source/runtime-build-and-cross-compilation.md +++ b/docs/source/runtime-build-and-cross-compilation.md @@ -140,7 +140,7 @@ Assuming Android NDK is available, run: rm -rf cmake-android-out && mkdir cmake-android-out && cd cmake-android-out # point -DCMAKE_TOOLCHAIN_FILE to the location where ndk is installed -# Run `which buck2`, if it returns empty (meaning the system doesn't know where buck2 is installed), pass in pass in this flag `-DBUCK2=/path/to/buck2` pointing to buck2 +# Run `which buck2`, if it returns empty (meaning the system doesn't know where buck2 is installed), pass in this flag `-DBUCK2=/path/to/buck2` pointing to buck2 cmake -DCMAKE_TOOLCHAIN_FILE=/Users/{user_name}/Library/Android/sdk/ndk/25.2.9519653/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a .. cd .. diff --git a/docs/source/tutorials_source/export-to-executorch-tutorial.py b/docs/source/tutorials_source/export-to-executorch-tutorial.py index b5c5c00f47..49fc2c42b7 100644 --- a/docs/source/tutorials_source/export-to-executorch-tutorial.py +++ b/docs/source/tutorials_source/export-to-executorch-tutorial.py @@ -130,11 +130,11 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: aten_dialect: ExportedProgram = export(f, example_args) # Works correctly -print(aten_dialect(torch.ones(3, 3), torch.ones(3, 3))) +print(aten_dialect.module()(torch.ones(3, 3), torch.ones(3, 3))) # Errors try: - print(aten_dialect(torch.ones(3, 2), torch.ones(3, 2))) + print(aten_dialect.module()(torch.ones(3, 2), torch.ones(3, 2))) except Exception: tb.print_exc() @@ -175,18 +175,18 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: # Now let's try running the model with different shapes: # Works correctly -print(aten_dialect(torch.ones(3, 3), torch.ones(3, 3))) -print(aten_dialect(torch.ones(3, 2), torch.ones(3, 2))) +print(aten_dialect.module()(torch.ones(3, 3), torch.ones(3, 3))) +print(aten_dialect.module()(torch.ones(3, 2), torch.ones(3, 2))) # Errors because it violates our constraint that input 0, dim 1 <= 10 try: - print(aten_dialect(torch.ones(3, 15), torch.ones(3, 15))) + print(aten_dialect.module()(torch.ones(3, 15), torch.ones(3, 15))) except Exception: tb.print_exc() # Errors because it violates our constraint that input 0, dim 1 == input 1, dim 1 try: - print(aten_dialect(torch.ones(3, 3), torch.ones(3, 2))) + print(aten_dialect.module()(torch.ones(3, 3), torch.ones(3, 2))) except Exception: tb.print_exc() @@ -287,23 +287,25 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: # there is only one program, it will by default be saved to the name "forward". 
-def encode(x): - return torch.nn.functional.linear(x, torch.randn(5, 10)) +class Encode(torch.nn.Module): + def forward(self, x): + return torch.nn.functional.linear(x, torch.randn(5, 10)) -def decode(x): - return torch.nn.functional.linear(x, torch.randn(10, 5)) +class Decode(torch.nn.Module): + def forward(self, x): + return torch.nn.functional.linear(x, torch.randn(10, 5)) encode_args = (torch.randn(1, 10),) aten_encode: ExportedProgram = export( - capture_pre_autograd_graph(encode, encode_args), + capture_pre_autograd_graph(Encode(), encode_args), encode_args, ) decode_args = (torch.randn(1, 5),) aten_decode: ExportedProgram = export( - capture_pre_autograd_graph(decode, decode_args), + capture_pre_autograd_graph(Decode(), decode_args), decode_args, ) @@ -486,17 +488,18 @@ def forward(self, x): # ``LoweredBackendModule`` for each of those subgraphs. -def f(a, x, b): - y = torch.mm(a, x) - z = y + b - a = z - a - y = torch.mm(a, x) - z = y + b - return z +class Foo(torch.nn.Module): + def forward(self, a, x, b): + y = torch.mm(a, x) + z = y + b + a = z - a + y = torch.mm(a, x) + z = y + b + return z example_args = (torch.randn(2, 2), torch.randn(2, 2), torch.randn(2, 2)) -pre_autograd_aten_dialect = capture_pre_autograd_graph(f, example_args) +pre_autograd_aten_dialect = capture_pre_autograd_graph(Foo(), example_args) aten_dialect: ExportedProgram = export(pre_autograd_aten_dialect, example_args) edge_program: EdgeProgramManager = to_edge(aten_dialect) exported_program = edge_program.exported_program() @@ -520,17 +523,18 @@ def f(a, x, b): # call ``to_backend`` on it: -def f(a, x, b): - y = torch.mm(a, x) - z = y + b - a = z - a - y = torch.mm(a, x) - z = y + b - return z +class Foo(torch.nn.Module): + def forward(self, a, x, b): + y = torch.mm(a, x) + z = y + b + a = z - a + y = torch.mm(a, x) + z = y + b + return z example_args = (torch.randn(2, 2), torch.randn(2, 2), torch.randn(2, 2)) -pre_autograd_aten_dialect = capture_pre_autograd_graph(f, example_args) +pre_autograd_aten_dialect = capture_pre_autograd_graph(Foo(), example_args) aten_dialect: ExportedProgram = export(pre_autograd_aten_dialect, example_args) edge_program: EdgeProgramManager = to_edge(aten_dialect) exported_program = edge_program.exported_program() diff --git a/examples/apple/coreml/README.md b/examples/apple/coreml/README.md index f576476d5c..a10f3efcc9 100644 --- a/examples/apple/coreml/README.md +++ b/examples/apple/coreml/README.md @@ -36,7 +36,7 @@ cd executorch python3 -m examples.portable.scripts.export -h # Generates ./add_coreml_all.pte file if successful. -python3 -m examples.apple.coreml.scripts.export_and_delegate --model_name add +python3 -m examples.apple.coreml.scripts.export --model_name add ``` 4. Once we have the **Core ML** delegated model binary (pte) file, then let's run it with the **ExecuTorch** runtime using the `coreml_executor_runner`. @@ -52,7 +52,7 @@ cd executorch ``` ## Frequently encountered errors and resolution. -- The `examples.apple.coreml.scripts.export_and_delegate` could fail if the model is not supported by the **Core ML** backend. The following models from the examples models list (` python3 -m examples.portable.scripts.export -h`)are currently supported by the **Core ML** backend. +- The `examples.apple.coreml.scripts.export` could fail if the model is not supported by the **Core ML** backend. The following models from the examples models list (` python3 -m examples.portable.scripts.export -h`)are currently supported by the **Core ML** backend. 
``` add diff --git a/examples/apple/coreml/scripts/export_and_delegate.py b/examples/apple/coreml/scripts/export.py similarity index 80% rename from examples/apple/coreml/scripts/export_and_delegate.py rename to examples/apple/coreml/scripts/export.py index 51f9981210..65d7d840a8 100644 --- a/examples/apple/coreml/scripts/export_and_delegate.py +++ b/examples/apple/coreml/scripts/export.py @@ -10,8 +10,14 @@ import executorch.exir as exir +import torch + from executorch.backends.apple.coreml.compiler import CoreMLBackend +from executorch.backends.apple.coreml.partition.coreml_partitioner import ( + CoreMLPartitioner, +) + from executorch.exir.backend.backend_api import to_backend from executorch.exir.backend.compile_spec_schema import CompileSpec from executorch.sdk.etrecord import generate_etrecord @@ -30,6 +36,37 @@ _check_ir_validity=False, ) +compute_units = ["cpu_only", "cpu_and_gpu", "cpu_and_ane", "all"] + + +def parse_args() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser() + + parser.add_argument( + "-m", + "--model_name", + required=True, + help=f"Provide model name. Valid ones: {list(MODEL_NAME_TO_MODEL.keys())}", + ) + + parser.add_argument( + "-c", + "--compute_units", + required=False, + default="all", + help=f"Provide compute units. Valid ones: {compute_units}", + ) + parser.add_argument("--use_partitioner", action=argparse.BooleanOptionalAction) + parser.add_argument("--generate_etrecord", action=argparse.BooleanOptionalAction) + parser.add_argument("--save_processed_bytes", action=argparse.BooleanOptionalAction) + + args = parser.parse_args() + return args + + +def partition_module_to_coreml(module): + module = module.eval() + def lower_module_to_coreml(module, compute_units): module = module.eval() @@ -81,30 +118,8 @@ def save_processed_bytes(processed_bytes, model_name, compute_units): return -compute_units = ["cpu_only", "cpu_and_gpu", "cpu_and_ane", "all"] - if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "-m", - "--model_name", - required=True, - help=f"Provide model name. Valid ones: {list(MODEL_NAME_TO_MODEL.keys())}", - ) - - parser.add_argument( - "-c", - "--compute_units", - required=False, - default="all", - help=f"Provide compute units. 
Valid ones: {compute_units}", - ) - - parser.add_argument("--generate_etrecord", action=argparse.BooleanOptionalAction) - - parser.add_argument("--save_processed_bytes", action=argparse.BooleanOptionalAction) - - args = parser.parse_args() + args = parse_args() if args.model_name not in MODEL_NAME_TO_MODEL: raise RuntimeError( @@ -122,15 +137,22 @@ def save_processed_bytes(processed_bytes, model_name, compute_units): *MODEL_NAME_TO_MODEL[args.model_name] ) - lowered_module, edge_copy = lower_module_to_coreml( - model, - args.compute_units, - ) - - exec_program = export_lowered_module_to_executorch_program( - lowered_module, - example_inputs, - ) + if args.use_partitioner: + model.eval() + exir_program_aten = torch.export.export(model, example_inputs) + edge_program_manager = exir.to_edge(exir_program_aten) + edge_copy = copy.deepcopy(edge_program_manager) + delegated_program_manager = edge_program_manager.to_backend(CoreMLPartitioner()) + exec_program = delegated_program_manager.to_executorch() + else: + lowered_module, edge_copy = lower_module_to_coreml( + model, + args.compute_units, + ) + exec_program = export_lowered_module_to_executorch_program( + lowered_module, + example_inputs, + ) save_executorch_program(exec_program, args.model_name, args.compute_units) generate_etrecord(f"{args.model_name}_coreml_etrecord.bin", edge_copy, exec_program) diff --git a/examples/apple/mps/executor_runner/targets.bzl b/examples/apple/mps/executor_runner/targets.bzl index cacc81b43e..b14a8105e4 100644 --- a/examples/apple/mps/executor_runner/targets.bzl +++ b/examples/apple/mps/executor_runner/targets.bzl @@ -25,7 +25,7 @@ def define_common_targets(): "//executorch/runtime/executor:program", "//executorch/extension/evalue_util:print_evalue", "//executorch/extension/data_loader:file_data_loader", - "//executorch/kernels/portable:generated_lib_all_ops", + "//executorch/kernels/portable:generated_lib", "//executorch/extension/data_loader:file_data_loader", "//executorch/sdk/etdump:etdump_flatcc", "//executorch/extension/data_loader:buffer_data_loader", diff --git a/examples/apple/mps/scripts/mps_example.py b/examples/apple/mps/scripts/mps_example.py index 34f22cc288..866bc85a3a 100644 --- a/examples/apple/mps/scripts/mps_example.py +++ b/examples/apple/mps/scripts/mps_example.py @@ -143,13 +143,10 @@ if not args.use_fp16: extension = "fp32" model_name = f"{model_name}_{extension}" - program_buffer = bundled_program_buffer - else: - program_buffer = executorch_program.buffer if args.generate_etrecord: etrecord_path = "etrecord.bin" logging.info("generating etrecord.bin") generate_etrecord(etrecord_path, edge_program_manager_copy, executorch_program) - save_pte_program(program_buffer, model_name) + save_pte_program(executorch_program, model_name) diff --git a/examples/arm/ethos-u-setup/ethos-u-vela/patches/0001-Improve-rescale-codegen-for-TOSA.patch b/examples/arm/ethos-u-setup/ethos-u-vela/patches/0001-Improve-rescale-codegen-for-TOSA.patch deleted file mode 100644 index e131ca76ee..0000000000 --- a/examples/arm/ethos-u-setup/ethos-u-vela/patches/0001-Improve-rescale-codegen-for-TOSA.patch +++ /dev/null @@ -1,129 +0,0 @@ -From ef07230fbb15edbf27ecaf48994fb157430a5e7c Mon Sep 17 00:00:00 2001 -From: Rob Elliott -Date: Thu, 5 Oct 2023 16:45:42 +0000 -Subject: [PATCH] Improve rescale codegen for TOSA - -Signed-off-by: Rob Elliott ---- - ethosu/vela/tosa_graph_optimiser.py | 56 +++++++++++------------------ - ethosu/vela/tosa_mapping.py | 2 +- - 2 files changed, 22 insertions(+), 36 deletions(-) - -diff --git 
a/ethosu/vela/tosa_graph_optimiser.py b/ethosu/vela/tosa_graph_optimiser.py -index df6b575..b2e3697 100644 ---- a/ethosu/vela/tosa_graph_optimiser.py -+++ b/ethosu/vela/tosa_graph_optimiser.py -@@ -337,7 +337,8 @@ def rewrite_concat(op): - - def remove_memory_ops(op, arch): - if op.run_on_npu and op.type in (Op.Reshape, Op.Identity): -- bypass_memory_only_ops(op) -+ # TODO: is this ok - function doesn't use arch or nng -+ bypass_memory_only_ops(op, arch, None) - - - def rewrite_activation(op, arch, nng): -@@ -357,7 +358,6 @@ def rewrite_activation(op, arch, nng): - - return op - -- - def rewrite_rescale(op, arch, nng): - if op.type == Op.Rescale: - ifm = op.ifm -@@ -368,7 +368,7 @@ def rewrite_rescale(op, arch, nng): - prev_op = ifm.ops[0] - - # TODO currently not supported -- assert len(ifm.consumer_list) == 1 -+ #assert len(ifm.consumer_list) == 1 - - input_zp = op.attrs["input_zp"] - output_zp = op.attrs["output_zp"] -@@ -390,6 +390,9 @@ def rewrite_rescale(op, arch, nng): - assert False - ifm.quantization.zero_point = input_zp - ofm.quantization.zero_point = output_zp -+ -+ assert False == per_channel, "Don't like per_channel!" -+ - for s, m in zip(shift, multiplier): - # TODO these are the TOSA limitations - assert m >= 0 -@@ -403,45 +406,28 @@ def rewrite_rescale(op, arch, nng): - else: - rounding_mode = RoundingMode.HalfUp - -- if prev_op.type.is_depthwise_conv2d_op() or prev_op.type.is_conv2d_op() or prev_op.type == Op.FullyConnected: -+ fuse = len(ifm.ops) == 1 and prev_op.type.is_depthwise_conv2d_op() or prev_op.type.is_conv2d_op() -+ if fuse: -+ # TODO: ERROR: bias.values didn't exist for an op like Add - presumably not a capability of that op - assert len(multiplier) == len(shift) == len(prev_op.bias.values) -- -- if ifm.dtype == DataType.int32 and per_channel: -- prev_op.explicit_scaling = explicit_scaling -- prev_op.rounding_mode = rounding_mode -- -- # Bypass op -- prev_op.set_output_tensor(ofm) -- DebugDatabase.add_optimised(op, prev_op) -- return op -- else: -- print("Warning, unsupported fusing of TOSA Rescale previous operator is of type:", prev_op.type) -- assert False -- # TODO which are the cases we need to and can do standalone Rescale? -- # TODO should we try to identify a conversion uint8<->int8 accomplished by 2 RESCALE ops? -- # origin might be TFLite op QUANTIZE, should we look to see if they can be translated to QUANTIZE? -- # limited to these at the moment: -- elif ( -- (ifm.dtype == DataType.int8 and ofm.dtype == DataType.int8) -- or (ifm.dtype == DataType.uint8 and ofm.dtype == DataType.int8) -- or (ifm.dtype == DataType.int8 and ofm.dtype == DataType.uint8) -- ): -- # Create NOP performing the RESCALE -+ # TODO: generate replacement fusion code from below -+ assert False, "Fusion possible but i've not implemented it" -+ else: -+ # Generate Rescale behaviour attached to a compatible NOP -+ # TODO: I assume this attaches a new operator into the graph?? - avgpool_op = replace_rescale_with_avg_pool(op) - avgpool_op.rounding_mode = rounding_mode -- -+ - if per_channel: -- # TODO -- avgpool_op.explicit_scaling = explicit_scaling -- print("Warning, unsupported TOSA Rescale") -- assert False -+ assert False, "Assert above removed but still not implemented... 
:/" - else: - avgpool_op.explicit_scaling = explicit_scaling -- else: -- print("Warning, unsupported fusing of TOSA Rescale previous operator is of type:", prev_op.type) -- assert False -- return op - -+ #print( len(multiplier), len(shift), len(prev_op.get_bias_tensors()) ) -+ #print( ifm.dtype, "PC:", per_channel, op.type ) -+ #print( ifm.dtype, ofm.dtype ) -+ -+ return op - - def convert_pad_in_width(op): - """ -diff --git a/ethosu/vela/tosa_mapping.py b/ethosu/vela/tosa_mapping.py -index 2dafd81..ed5aa2e 100644 ---- a/ethosu/vela/tosa_mapping.py -+++ b/ethosu/vela/tosa_mapping.py -@@ -148,7 +148,7 @@ transpose_conv_attrs = AttrSerializer( - ) - transpose_attrs = AttrSerializer("TransposeAttribute", (("perms", is_vec),)) - axis_attrs = AttrSerializer("AxisAttribute", ("axis",)) --reshape_attrs = AttrSerializer("ReshapeAttribute", (("shape", is_vec),)) -+reshape_attrs = AttrSerializer("ReshapeAttribute", (("newShape", is_vec),)) - slice_attrs = AttrSerializer("SliceAttribute", (("start", is_vec), ("size", is_vec))) - tile_attrs = AttrSerializer("TileAttribute", (("multiplies", is_vec),)) - resize_attrs = AttrSerializer( --- -2.41.0 - diff --git a/examples/arm/ethos-u-setup/ethos-u-vela/patches/0002-Use-TOSA-0.80.1_new-12f0e94aca6c17d0c6dc9b463277ab38.patch b/examples/arm/ethos-u-setup/ethos-u-vela/patches/0002-Use-TOSA-0.80.1_new-12f0e94aca6c17d0c6dc9b463277ab38.patch deleted file mode 100644 index 5d9a560b08..0000000000 --- a/examples/arm/ethos-u-setup/ethos-u-vela/patches/0002-Use-TOSA-0.80.1_new-12f0e94aca6c17d0c6dc9b463277ab38.patch +++ /dev/null @@ -1,26 +0,0 @@ -From 394636ef2063e386f54abde094298bb0e40a2cb7 Mon Sep 17 00:00:00 2001 -From: Zingo Andersen -Date: Sat, 20 Jan 2024 10:34:45 +0100 -Subject: [PATCH 2/2] Use TOSA 0.80.1 - -Signed-off-by: Zingo Andersen ---- - ethosu/vela/tosa_reader.py | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/ethosu/vela/tosa_reader.py b/ethosu/vela/tosa_reader.py -index 56af59d..7cb2bf3 100644 ---- a/ethosu/vela/tosa_reader.py -+++ b/ethosu/vela/tosa_reader.py -@@ -294,7 +294,7 @@ class TosaGraph: - def check_version(self, tosa_graph): - version = tosa_graph.Version() - version_str = f"{version._Major()}.{version._Minor()}.{version._Patch()}" -- if version_str != "0.80.0": -+ if version_str != "0.80.1": - print(f"Unsupported TOSA version: {version_str}") - assert False - --- -2.25.1 - diff --git a/examples/arm/setup.sh b/examples/arm/setup.sh index 991772166a..d1eeb84173 100755 --- a/examples/arm/setup.sh +++ b/examples/arm/setup.sh @@ -215,7 +215,7 @@ function setup_vela() { if [[ ! -e ethos-u-vela ]]; then git clone https://review.mlplatform.org/ml/ethos-u/ethos-u-vela repo_dir="${root_dir}/ethos-u-vela" - base_rev=00a15db3e1a188b25065d095152d701f4394cdc5 + base_rev=78b9412b07e0a46e58e8ecb9da8d661399c006a5 patch_repo fi cd "${root_dir}/ethos-u-vela" diff --git a/examples/demo-apps/android/ExecuTorchDemo/README.md b/examples/demo-apps/android/ExecuTorchDemo/README.md index 802e1d9493..cffac7e612 100644 --- a/examples/demo-apps/android/ExecuTorchDemo/README.md +++ b/examples/demo-apps/android/ExecuTorchDemo/README.md @@ -116,7 +116,7 @@ cmake .. 
\ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ -DANDROID_ABI=arm64-v8a \ -DBUCK2=/tmp/buck2 \ - -DEXECUTORCH_BUILD_ANDROID_DEMO_APP_JNI=ON \ + -DEXECUTORCH_BUILD_ANDROID_JNI=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_FLATC=OFF \ -DEXECUTORCH_BUILD_QNN=ON \ diff --git a/examples/demo-apps/android/ExecuTorchDemo/app/src/main/java/com/example/executorchdemo/ClassificationActivity.java b/examples/demo-apps/android/ExecuTorchDemo/app/src/main/java/com/example/executorchdemo/ClassificationActivity.java index 8c4dd8f8de..93235720b4 100644 --- a/examples/demo-apps/android/ExecuTorchDemo/app/src/main/java/com/example/executorchdemo/ClassificationActivity.java +++ b/examples/demo-apps/android/ExecuTorchDemo/app/src/main/java/com/example/executorchdemo/ClassificationActivity.java @@ -71,7 +71,7 @@ public void run() { TensorImageUtils.TORCHVISION_NORM_STD_RGB); // running the model - final Tensor outputTensor = module.forward(EValue.from(inputTensor)).toTensor(); + final Tensor outputTensor = module.forward(EValue.from(inputTensor))[0].toTensor(); // getting tensor content as java array of floats final float[] scores = outputTensor.getDataAsFloatArray(); diff --git a/examples/demo-apps/android/ExecuTorchDemo/app/src/main/java/com/example/executorchdemo/MainActivity.java b/examples/demo-apps/android/ExecuTorchDemo/app/src/main/java/com/example/executorchdemo/MainActivity.java index cdb1ac1983..25f1853e96 100644 --- a/examples/demo-apps/android/ExecuTorchDemo/app/src/main/java/com/example/executorchdemo/MainActivity.java +++ b/examples/demo-apps/android/ExecuTorchDemo/app/src/main/java/com/example/executorchdemo/MainActivity.java @@ -189,7 +189,7 @@ public void run() { final float[] inputs = inputTensor.getDataAsFloatArray(); final long startTime = SystemClock.elapsedRealtime(); - Tensor outputTensor = mModule.forward(EValue.from(inputTensor)).toTensor(); + Tensor outputTensor = mModule.forward(EValue.from(inputTensor))[0].toTensor(); final long inferenceTime = SystemClock.elapsedRealtime() - startTime; Log.d("ImageSegmentation", "inference time (ms): " + inferenceTime); diff --git a/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts b/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts index 3b5e415e19..7297aa60ea 100644 --- a/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts +++ b/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts @@ -12,11 +12,11 @@ plugins { } android { - namespace = "com.example.executorchdemo" + namespace = "com.example.executorchllamademo" compileSdk = 34 defaultConfig { - applicationId = "com.example.executorchdemo" + applicationId = "com.example.executorchllamademo" minSdk = 24 targetSdk = 33 versionCode = 1 diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchdemo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchdemo/MainActivity.java deleted file mode 100644 index c24c367860..0000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchdemo/MainActivity.java +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -package com.example.executorchllamademo; - -import android.app.Activity; -import android.app.AlertDialog; -import android.content.Context; -import android.os.Bundle; -import android.widget.Button; -import android.widget.EditText; -import android.widget.TextView; -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import org.pytorch.executorch.LlamaCallback; -import org.pytorch.executorch.LlamaModule; - -public class MainActivity extends Activity implements Runnable, LlamaCallback { - private EditText mEditTextMessage; - private TextView mTextViewChat; - private Button mSendButton; - private Button mStopButton; - private Button mModelButton; - private LlamaModule mModule = null; - private String mResult = null; - - private static String assetFilePath(Context context, String assetName) throws IOException { - File file = new File(context.getFilesDir(), assetName); - if (file.exists() && file.length() > 0) { - return file.getAbsolutePath(); - } - - try (InputStream is = context.getAssets().open(assetName)) { - try (OutputStream os = new FileOutputStream(file)) { - byte[] buffer = new byte[4 * 1024]; - int read; - while ((read = is.read(buffer)) != -1) { - os.write(buffer, 0, read); - } - os.flush(); - } - return file.getAbsolutePath(); - } - } - - @Override - public void onResult(String result) { - System.out.println("onResult: " + result); - mResult = result; - run(); - } - - private void setModel(String modelPath, String tokenizerPath) { - try { - String model = MainActivity.assetFilePath(getApplicationContext(), modelPath); - String tokenizer = MainActivity.assetFilePath(getApplicationContext(), tokenizerPath); - mModule = new LlamaModule(model, tokenizer, 0.8f); - } catch (IOException e) { - finish(); - } - } - - private void setLocalModel(String modelPath, String tokenizerPath) { - mModule = new LlamaModule(modelPath, tokenizerPath, 0.8f); - } - - private void modelDialog() { - AlertDialog.Builder builder = new AlertDialog.Builder(this); - builder.setTitle("Select a Model"); - builder.setSingleChoiceItems( - new String[] {"stories", "language"}, - -1, - new android.content.DialogInterface.OnClickListener() { - public void onClick(android.content.DialogInterface dialog, int item) { - switch (item) { - case 0: - setModel("stories110M.pte", "tokenizer.bin"); - break; - case 1: - setLocalModel("/data/local/tmp/language.pte", "/data/local/tmp/language.bin"); - break; - } - mEditTextMessage.setText(""); - mTextViewChat.setText(""); - dialog.dismiss(); - } - }); - AlertDialog alert = builder.create(); - alert.show(); - } - - @Override - protected void onCreate(Bundle savedInstanceState) { - super.onCreate(savedInstanceState); - setContentView(R.layout.activity_main); - - mEditTextMessage = findViewById(R.id.editTextMessage); - mTextViewChat = findViewById(R.id.textViewChat); - mSendButton = findViewById(R.id.sendButton); - mStopButton = findViewById(R.id.stopButton); - mModelButton = findViewById(R.id.modelButton); - - mSendButton.setOnClickListener( - view -> { - String prompt = mEditTextMessage.getText().toString(); - mTextViewChat.append(prompt); - mEditTextMessage.setText(""); - Runnable runnable = - new Runnable() { - @Override - public void run() { - mModule.generate(prompt, MainActivity.this); - } - }; - new Thread(runnable).start(); - }); - - mStopButton.setOnClickListener( - view -> { - mModule.stop(); - }); - - mModelButton.setOnClickListener( - view -> { - mModule.stop(); - 
modelDialog(); - }); - - setModel("stories110M.pte", "tokenizer.bin"); - } - - @Override - public void run() { - runOnUiThread( - new Runnable() { - @Override - public void run() { - mTextViewChat.append(mResult); - } - }); - } -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java new file mode 100644 index 0000000000..dd7cbfe50f --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java @@ -0,0 +1,215 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package com.example.executorchllamademo; + +import android.app.Activity; +import android.app.ActivityManager; +import android.app.AlertDialog; +import android.content.Context; +import android.os.Bundle; +import android.widget.Button; +import android.widget.EditText; +import android.widget.ImageButton; +import android.widget.ListView; +import java.io.File; +import org.pytorch.executorch.LlamaCallback; +import org.pytorch.executorch.LlamaModule; + +public class MainActivity extends Activity implements Runnable, LlamaCallback { + private EditText mEditTextMessage; + private Button mSendButton; + private ImageButton mModelButton; + private ListView mMessagesView; + private MessageAdapter mMessageAdapter; + private LlamaModule mModule = null; + private Message mResultMessage = null; + + private int mNumTokens = 0; + private long mRunStartTime = 0; + private String mModelFilePath = ""; + private String mTokenizerFilePath = ""; + + @Override + public void onResult(String result) { + System.out.println("onResult: " + result); + mResultMessage.appendText(result); + mNumTokens++; + run(); + } + + private static String[] listLocalFile(String path, String suffix) { + File directory = new File(path); + if (directory.exists() && directory.isDirectory()) { + File[] files = directory.listFiles((dir, name) -> name.toLowerCase().endsWith(suffix)); + String[] result = new String[files.length]; + for (int i = 0; i < files.length; i++) { + if (files[i].isFile() && files[i].getName().endsWith(suffix)) { + result[i] = files[i].getAbsolutePath(); + } + } + return result; + } + return null; + } + + private void setLocalModel(String modelPath, String tokenizerPath) { + long runStartTime = System.currentTimeMillis(); + mModule = new LlamaModule(modelPath, tokenizerPath, 0.8f); + int loadResult = mModule.load(); + if (loadResult != 0) { + AlertDialog.Builder builder = new AlertDialog.Builder(this); + builder.setTitle("Load failed: " + loadResult); + AlertDialog alert = builder.create(); + alert.show(); + } + + long runDuration = System.currentTimeMillis() - runStartTime; + String modelInfo = + "Model path: " + + modelPath + + "\nTokenizer path: " + + tokenizerPath + + "\nModel loaded time: " + + runDuration + + " ms"; + Message modelLoadedMessage = new Message(modelInfo, false); + mMessageAdapter.add(modelLoadedMessage); + mMessageAdapter.notifyDataSetChanged(); + } + + private String memoryInfo() { + final ActivityManager am = (ActivityManager) getSystemService(Context.ACTIVITY_SERVICE); + ActivityManager.MemoryInfo memInfo = new ActivityManager.MemoryInfo(); + am.getMemoryInfo(memInfo); + return "Total RAM: " + + Math.floorDiv(memInfo.totalMem, 1000000) 
+ + " MB. Available RAM: " + + Math.floorDiv(memInfo.availMem, 1000000) + + " MB."; + } + + private void modelDialog() { + String[] pteFiles = listLocalFile("/data/local/tmp/llama/", ".pte"); + String[] binFiles = listLocalFile("/data/local/tmp/llama/", ".bin"); + AlertDialog.Builder modelPathBuilder = new AlertDialog.Builder(this); + modelPathBuilder.setTitle("Select model path"); + AlertDialog.Builder tokenizerPathBuilder = new AlertDialog.Builder(this); + tokenizerPathBuilder.setTitle("Select tokenizer path"); + modelPathBuilder.setSingleChoiceItems( + binFiles, + -1, + (dialog, item) -> { + mTokenizerFilePath = binFiles[item]; + mEditTextMessage.setText(""); + dialog.dismiss(); + tokenizerPathBuilder.create().show(); + }); + + tokenizerPathBuilder.setSingleChoiceItems( + pteFiles, + -1, + (dialog, item) -> { + mModelFilePath = pteFiles[item]; + setLocalModel(mModelFilePath, mTokenizerFilePath); + dialog.dismiss(); + }); + + modelPathBuilder.create().show(); + } + + @Override + protected void onCreate(Bundle savedInstanceState) { + super.onCreate(savedInstanceState); + setContentView(R.layout.activity_main); + + mEditTextMessage = findViewById(R.id.editTextMessage); + mSendButton = findViewById(R.id.sendButton); + mModelButton = findViewById(R.id.modelButton); + mMessagesView = findViewById(R.id.messages_view); + mMessageAdapter = new MessageAdapter(this, R.layout.sent_message); + mMessagesView.setAdapter(mMessageAdapter); + mModelButton.setOnClickListener( + view -> { + mModule.stop(); + mMessageAdapter.clear(); + mMessageAdapter.notifyDataSetChanged(); + modelDialog(); + }); + + setLocalModel("/data/local/tmp/llama/stories110M.pte", "/data/local/tmp/llama/tokenizer.bin"); + onModelRunStopped(); + } + + private void onModelRunStarted() { + mSendButton.setText("Stop"); + mSendButton.setOnClickListener( + view -> { + mModule.stop(); + }); + + mRunStartTime = System.currentTimeMillis(); + } + + private void onModelRunStopped() { + setTitle(memoryInfo()); + long runDuration = System.currentTimeMillis() - mRunStartTime; + if (mResultMessage != null) { + mResultMessage.setTokensPerSecond(1.0f * mNumTokens / (runDuration / 1000.0f)); + } + mSendButton.setText("Generate"); + mSendButton.setOnClickListener( + view -> { + String prompt = mEditTextMessage.getText().toString(); + mMessageAdapter.add(new Message(prompt, true)); + mMessageAdapter.notifyDataSetChanged(); + mEditTextMessage.setText(""); + mResultMessage = new Message("", false); + mMessageAdapter.add(mResultMessage); + Runnable runnable = + new Runnable() { + @Override + public void run() { + runOnUiThread( + new Runnable() { + @Override + public void run() { + onModelRunStarted(); + } + }); + + mModule.generate(prompt, MainActivity.this); + + runOnUiThread( + new Runnable() { + @Override + public void run() { + onModelRunStopped(); + } + }); + } + }; + new Thread(runnable).start(); + }); + mNumTokens = 0; + mRunStartTime = 0; + mMessageAdapter.notifyDataSetChanged(); + } + + @Override + public void run() { + runOnUiThread( + new Runnable() { + @Override + public void run() { + mMessageAdapter.notifyDataSetChanged(); + setTitle(memoryInfo()); + } + }); + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/Message.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/Message.java new file mode 100644 index 0000000000..81b77b1aba --- /dev/null +++ 
b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/Message.java @@ -0,0 +1,40 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package com.example.executorchllamademo; + +public class Message { + private String text; + private boolean isSent; + private float tokensPerSecond; + + public Message(String text, boolean isSent) { + this.text = text; + this.isSent = isSent; + } + + public String getText() { + return text; + } + + public void appendText(String text) { + this.text += text; + } + + public boolean getIsSent() { + return isSent; + } + + public void setTokensPerSecond(float tokensPerSecond) { + this.tokensPerSecond = tokensPerSecond; + } + + public float getTokensPerSecond() { + return tokensPerSecond; + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java new file mode 100644 index 0000000000..656da1967d --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java @@ -0,0 +1,40 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package com.example.executorchllamademo; + +import android.view.LayoutInflater; +import android.view.View; +import android.view.ViewGroup; +import android.widget.ArrayAdapter; +import android.widget.TextView; + +public class MessageAdapter extends ArrayAdapter<Message> { + public MessageAdapter(android.content.Context context, int resource) { + super(context, resource); + } + + @Override + public View getView(int position, View convertView, ViewGroup parent) { + Message currentMessage = getItem(position); + + int layoutIdForListItem = + currentMessage.getIsSent() ?
R.layout.sent_message : R.layout.received_message; + View listItemView = + LayoutInflater.from(getContext()).inflate(layoutIdForListItem, parent, false); + TextView messageTextView = listItemView.findViewById(R.id.message_text); + messageTextView.setText(currentMessage.getText()); + + if (currentMessage.getTokensPerSecond() > 0) { + TextView tokensView = listItemView.findViewById(R.id.tokens_per_second); + tokensView.setText("" + currentMessage.getTokensPerSecond() + " t/s"); + } + + return listItemView; + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/received_message.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/received_message.xml new file mode 100644 index 0000000000..ea2d1bbfa1 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/received_message.xml @@ -0,0 +1,6 @@ diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/sent_message.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/sent_message.xml new file mode 100644 index 0000000000..e8d13ca4e1 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/sent_message.xml @@ -0,0 +1,6 @@ diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/three_dots.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/three_dots.xml new file mode 100644 index 0000000000..afbe22da80 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/three_dots.xml @@ -0,0 +1,5 @@ diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml index f769578d33..089acb572b 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml @@ -1,44 +1,44 @@