Update on " [ExecuTorch][BE] Split kv cache and SDPA for better code …

…sharing" Summary: Why? We have coupled SDPA with kv cache for a while. Initially this was done as we implemented the sdpa_with_kv_cache custom op to reduce multiple copy overheads from kv cache update. (This could have been done by having a separate custom kv cache update op and a custom sdpa op. Recent changes enabled this.) As a result of the SDPA module owning the kv cache, we get a) a non-composable implementation and b) model definitions and components that are harder to reuse from repos like tune. The outcome of this is that we have multiple definitions of the same model, llama, lying around in ET, TorchChat and Tune. This diff and subsequent ones will try to move in the direction where custom kv cache and custom sdpa become decoupled and composable, making it more module-swap friendly with tune's model definition. How? Earlier PRs decoupled kv cache update from sdpa. So now 1. Decouple the SDPA nn.Module from the KV cache. 2. Standardize on the KVCache and SDPA interface. That is, KVCache and SDPA both operate on q, k, v in [B, # heads, seq_len, head_dim] formatted tensors. 3. Step 2 will introduce multiple transposes when KVCache and SDPA are replaced by custom modules, but we will write a graph pass to undo those. Test Plan: Existing tests. Make sure perf doesn't regress. Differential Revision: [D67914054](https://our.internmc.facebook.com/intern/diff/D67914054) [ghstack-poisoned]
pytorch · Jan 13, 2025 · 3a6b545 · 3a6b545
2 parents d20dd95 + df1383b
commit 3a6b545
Show file tree

Hide file tree

Showing 207 changed files with 5,800 additions and 2,245 deletions.
diff --git a/.ci/docker/ci_commit_pins/buck2.txt b/.ci/docker/ci_commit_pins/buck2.txt
@@ -1 +1 @@
-2024-05-15
+2024-12-16
diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt
@@ -1 +1 @@
-2ea4b56ec872424e486c4fe2d55da061067a2ed3
+0a94bb432ed75cc2d950d81b2921363218a7e459
diff --git a/.ci/docker/conda-env-ci.txt b/.ci/docker/conda-env-ci.txt
@@ -1,2 +1,4 @@
 cmake=3.22.1
 ninja=1.10.2
+libuv
+pkg-config
diff --git a/.ci/scripts/setup-arm-baremetal-tools.sh b/.ci/scripts/setup-arm-baremetal-tools.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Copyright 2024 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# NB: This function could be used to install Arm dependencies
+# Setup arm example environment (including TOSA tools)
+git config --global user.email "github_executorch@arm.com"
+git config --global user.name "Github Executorch"
+bash examples/arm/setup.sh --i-agree-to-the-contained-eula
diff --git a/.ci/scripts/setup-macos.sh b/.ci/scripts/setup-macos.sh
@@ -131,5 +131,9 @@ if [[ -z "${GITHUB_RUNNER:-}" ]]; then
 fi
 
 print_cmake_info
-install_executorch
+install_pytorch_and_domains
+# We build PyTorch from source here instead of using nightly. This allows CI to test against
+# the pinned commit from PyTorch
+install_executorch "use-pt-pinned-commit"
 build_executorch_runner "${BUILD_TOOL}"
+do_not_use_nightly_on_ci
diff --git a/.ci/scripts/utils.sh b/.ci/scripts/utils.sh
@@ -40,6 +40,42 @@ install_pip_dependencies() {
   popd || return
 }
 
+install_domains() {
+  echo "Install torchvision and torchaudio"
+  pip install --no-use-pep517 --user "git+https://github.com/pytorch/audio.git@${TORCHAUDIO_VERSION}"
+  pip install --no-use-pep517 --user "git+https://github.com/pytorch/vision.git@${TORCHVISION_VERSION}"
+}
+
+install_pytorch_and_domains() {
+  pushd .ci/docker || return
+  TORCH_VERSION=$(cat ci_commit_pins/pytorch.txt)
+  popd || return
+
+  git clone https://github.com/pytorch/pytorch.git
+
+  # Fetch the target commit
+  pushd pytorch || return
+  git checkout "${TORCH_VERSION}"
+  git submodule update --init --recursive
+
+  export USE_DISTRIBUTED=1
+  # Then build and install PyTorch
+  python setup.py bdist_wheel
+  pip install "$(echo dist/*.whl)"
+
+  # Grab the pinned audio and vision commits from PyTorch
+  TORCHAUDIO_VERSION=$(cat .github/ci_commit_pins/audio.txt)
+  export TORCHAUDIO_VERSION
+  TORCHVISION_VERSION=$(cat .github/ci_commit_pins/vision.txt)
+  export TORCHVISION_VERSION
+
+  install_domains
+
+  popd || return
+  # Print sccache stats for debugging
+  sccache --show-stats || true
+}
+
 install_flatc_from_source() {
   # NB: This function could be used to install flatbuffer from source
   pushd third-party/flatbuffers || return
@@ -59,17 +95,6 @@ install_flatc_from_source() {
   popd || return
 }
 
-install_arm() {
-  # NB: This function could be used to install Arm dependencies
-  # Setup arm example environment (including TOSA tools)
-  git config --global user.email "github_executorch@arm.com"
-  git config --global user.name "Github Executorch"
-  bash examples/arm/setup.sh --i-agree-to-the-contained-eula
-
-  # Test tosa_reference flow
-  source examples/arm/ethos-u-scratch/setup_path.sh
-}
-
 build_executorch_runner_buck2() {
   # Build executorch runtime with retry as this step is flaky on macos CI
   retry buck2 build //examples/portable/executor_runner:executor_runner

diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml
@@ -410,7 +410,7 @@ jobs:
     runs-on: linux.2xlarge
     steps:
       - name: Download the apps from GitHub
-        uses: actions/download-artifact@v3
+        uses: actions/download-artifact@v4
         with:
           # The name here needs to match the name of the upload-artifact parameter
           name: ios-apps

diff --git a/.github/workflows/apple.yml b/.github/workflows/apple.yml
@@ -53,7 +53,7 @@ jobs:
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       timeout: 90
       secrets-env: BUILD_CERTIFICATE_BASE64 EXECUTORCH_DEMO_BUILD_PROVISION_PROFILE_BASE64 KEYCHAIN_PASSWORD
-      upload-artifact: ios-apps
+      upload-artifact: ios-demo-app
       script: |
         set -eux
 
@@ -83,10 +83,10 @@ jobs:
     runs-on: linux.2xlarge
     steps:
       - name: Download the artifacts from GitHub
-        uses: actions/download-artifact@v3
+        uses: actions/download-artifact@v4
         with:
           # The name here needs to match the name of the upload-artifact parameter
-          name: ios-apps
+          name: ios-demo-app
           path: ${{ runner.temp }}/artifacts/
 
       - name: Verify the artifacts
@@ -216,7 +216,7 @@ jobs:
           role-to-assume: arn:aws:iam::308535385114:role/gha_executorch_upload-frameworks-ios
           aws-region: us-east-1
       - name: Download the artifact
-        uses: actions/download-artifact@v3
+        uses: actions/download-artifact@v4
         with:
           # NB: The name here needs to match the upload-artifact name from build-frameworks-ios job
           name: executorch-frameworks-ios
@@ -291,7 +291,7 @@ jobs:
       python-version: '3.11'
       submodules: 'true'
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-      upload-artifact: ios-apps
+      upload-artifact: ios-benchmark-app
       secrets-env: BUILD_CERTIFICATE_BASE64 EXECUTORCH_BENCHMARK_BUILD_PROVISION_PROFILE_BASE64 KEYCHAIN_PASSWORD
       timeout: 90
       script: |

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
@@ -354,13 +354,11 @@ jobs:
         EXECUTORCH_BUILD_ARM_BAREMETAL=ON \
         .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
 
-        source .ci/scripts/utils.sh
         # Install Arm dependencies
-        install_arm
-
-        # Run pytest with coverage
-        pytest -c /dev/null -v -n auto --cov=./ --cov-report=xml backends/arm/test
+        .ci/scripts/setup-arm-baremetal-tools.sh
 
+        # Run pytest without simulator
+        backends/arm/test/test_arm_baremetal.sh test_pytest
 
   test-llama-runner-qnn-linux:
     name: test-llama-runner-qnn-linux

diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
@@ -146,14 +146,15 @@ jobs:
         source .ci/scripts/utils.sh
         install_executorch
 
-        install_arm
+        .ci/scripts/setup-arm-baremetal-tools.sh
 
         # Increase number of files user can monitor to bypass buck failures.
         # Hopefully this is high enough for this setup.
         sudo sysctl fs.inotify.max_user_watches=1048576 # 1024 * 1024
 
         # Test ethos-u delegate examples with run.sh
-        PYTHON_EXECUTABLE=python bash examples/arm/run.sh examples/arm/ethos-u-scratch/
+        backends/arm/test/test_arm_baremetal.sh test_run_ethosu_fvp
+
 
   test-arm-reference-delegation:
     name: test-arm-reference-delegation
@@ -172,10 +173,10 @@ jobs:
         source .ci/scripts/utils.sh
         install_executorch
 
-        install_arm
+        .ci/scripts/setup-arm-baremetal-tools.sh
 
-        # Run arm unit tests
-        pytest -c /dev/null -v -n auto --cov=./ --cov-report=xml backends/arm/test
+        # Run arm unit tests using the simulator
+        backends/arm/test/test_arm_baremetal.sh test_pytest_ethosu_fvp
 
   test-coreml-delegate:
     name: test-coreml-delegate

diff --git a/.gitmodules b/.gitmodules
@@ -66,7 +66,7 @@
 	url = https://github.com/pybind/pybind11.git
 [submodule "backends/cadence/fusion_g3/third-party/nnlib/nnlib-FusionG3"]
 	path = backends/cadence/fusion_g3/third-party/nnlib/nnlib-FusionG3
-	url = https://github.com/foss-xtensa/nnlib-FusionG3/
+	url = https://github.com/foss-xtensa/nnlib-FusionG3.git
 [submodule "third-party/ao"]
 	path = third-party/ao
 	url = https://github.com/pytorch/ao.git
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -197,7 +197,7 @@ If it's not clear how to add a test for your PR, take a look at the blame for
 the code you're modifying and find an author who has more context. Ask them
 for their help in the PR comments.
 
-TODO: Explain how to run tests locally without needing to push and wait for CI.
+The `test/run_oss_cpp_tests.sh` script will build and run C++ tests locally.
 
 ### Continuous Integration
 See https://hud.pytorch.org/hud/pytorch/executorch/main for the current state of

diff --git a/backends/apple/mps/mps_preprocess.py b/backends/apple/mps/mps_preprocess.py
@@ -32,6 +32,9 @@
     CompileSpec,
     PreprocessResult,
 )
+
+from executorch.exir.passes.memory_format_ops_pass import DimOrderOpsRevertPass
+from executorch.exir.program._program import _transform
 from torch.export.exported_program import ExportedProgram
 
 FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
@@ -83,6 +86,9 @@ def preprocess(
         #    FlatBuffer graph, process the `output` nodes and add their id to
         #    the `output_ids` array in the schema.
 
+        # TODO: Remove this once we have a better support for the dim-order ops.
+        edge_program = _transform(edge_program, DimOrderOpsRevertPass())
+
         mps_graph = MPSGraph(
             version="0",
             mps_nodes=[],

diff --git a/backends/apple/mps/operators/constant_ops.py b/backends/apple/mps/operators/constant_ops.py
@@ -79,6 +79,25 @@ def define_node(
         )
 
 
+@register_node_visitor
+class ToDimOrderEmptyVisitor(NodeVisitor):
+    target = ["dim_order_ops._empty_dim_order.default"]
+
+    def __init__(self, *args) -> None:
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        mps_graph: MPSGraph,
+    ) -> None:
+        # We should never get here, because DimOrderOpsRevertPass replaces this with an aten.empty.memory_format op
+        # But if we do, we can't handle it ATM, so raise an exception
+        raise NotImplementedError(
+            "dim_order_ops._empty_dim_order.default is not supported yet"
+        )
+
+
 @register_node_visitor
 class FullLikeVisitor(NodeVisitor):
     target = "aten.full_like.default"

diff --git a/backends/apple/mps/operators/op_clone.py b/backends/apple/mps/operators/op_clone.py
@@ -33,3 +33,22 @@ def define_node(
                 )
         input_id = self.define_tensor(get_input_node(node, 0), mps_graph)
         self.tensor_to_id[node] = input_id
+
+
+@register_node_visitor
+class ToDimOrderCopyVisitor(NodeVisitor):
+    target = ["dim_order_ops._to_dim_order_copy.default"]
+
+    def __init__(self, *args) -> None:
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        mps_graph: MPSGraph,
+    ) -> None:
+        # We should never get here, because DimOrderOpsRevertPass replaces this with an aten._to_copy op
+        # But if we do, we can't handle it ATM, so raise an exception
+        raise NotImplementedError(
+            "dim_order_ops._to_dim_order_copy.default is not supported yet"
+        )
diff --git a/backends/apple/mps/test/test_mps.py b/backends/apple/mps/test/test_mps.py
@@ -1829,6 +1829,21 @@ def forward(self, x):
             Clone(), model_inputs, func_name=inspect.stack()[0].function[5:]
         )
 
+    def test_mps_backend_to_copy(self):
+        class Copy(torch.nn.Module):
+            def forward(self, x):
+                return (
+                    torch.ops.aten._to_copy.default(
+                        x + 2, memory_format=torch.contiguous_format
+                    )
+                    + x
+                )
+
+        model_inputs = (torch.randn(1, 3, 3),)
+        self.lower_and_test_with_partitioner(
+            Copy(), model_inputs, func_name=inspect.stack()[0].function[5:]
+        )
+
     def test_mps_backend_floor(self):
         class Floor(torch.nn.Module):
             def forward(self, x):

diff --git a/backends/apple/mps/test/test_mps_utils.py b/backends/apple/mps/test/test_mps_utils.py
@@ -26,10 +26,7 @@
 
 # Config for Capturing the weights, will be moved in the future
 
-# TODO(T182928844): Delegate dim order op to backend.
-_EDGE_COMPILE_CONFIG = exir.EdgeCompileConfig(
-    _check_ir_validity=False, _skip_dim_order=True
-)
+_EDGE_COMPILE_CONFIG = exir.EdgeCompileConfig(_check_ir_validity=False)
 
 
 class ansi_colors:
@@ -219,7 +216,6 @@ def lower_module_and_test_output(
             dynamic_shapes=dynamic_shapes,
             edge_compile_config=EdgeCompileConfig(
                 _check_ir_validity=False,
-                _skip_dim_order=True,  # TODO(T182928844): Delegate dim order op to backend.
             ),
         )
 
@@ -250,7 +246,6 @@ def lower_module_and_test_output(
                 export(delegated_program, sample_inputs, strict=True),
                 compile_config=exir.EdgeCompileConfig(
                     _check_ir_validity=False,
-                    _skip_dim_order=True,  # TODO(T182928844): Delegate dim order op to backend.
                 ),
             ).to_executorch(
                 config=ExecutorchBackendConfig(extract_delegate_segments=False)
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		2ea4b56ec872424e486c4fe2d55da061067a2ed3
		0a94bb432ed75cc2d950d81b2921363218a7e459