diff --git a/.ci/scripts/utils.sh b/.ci/scripts/utils.sh index ebed4a3150..be684b7bfa 100644 --- a/.ci/scripts/utils.sh +++ b/.ci/scripts/utils.sh @@ -17,7 +17,7 @@ retry () { } clean_executorch_install_folders() { - ./install_requirements.sh --clean + ./install_executorch.sh --clean } install_executorch() { @@ -25,9 +25,9 @@ install_executorch() { # Install executorch, this assumes that Executorch is checked out in the # current directory. if [[ "${1:-}" == "use-pt-pinned-commit" ]]; then - ./install_requirements.sh --pybind xnnpack --use-pt-pinned-commit + ./install_executorch.sh --pybind xnnpack --use-pt-pinned-commit else - ./install_requirements.sh --pybind xnnpack + ./install_executorch.sh --pybind xnnpack fi # Just print out the list of packages for debugging pip list diff --git a/.github/workflows/apple.yml b/.github/workflows/apple.yml index 2e85eeec76..8ac755bf5d 100644 --- a/.github/workflows/apple.yml +++ b/.github/workflows/apple.yml @@ -9,7 +9,7 @@ on: paths: - .ci/scripts/setup-ios.sh - .github/workflows/apple.yml - - install_requirements.sh + - install_executorch.sh - backends/apple/** - build/build_apple_frameworks.sh - build/build_apple_llm_demo.sh diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 8b32e46cf2..dbe0e872ac 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -200,7 +200,7 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake" # install pybind - bash install_requirements.sh --pybind xnnpack + bash install_executorch.sh --pybind xnnpack # install Llava requirements bash examples/models/llama/install_requirements.sh @@ -333,6 +333,9 @@ jobs: unittest-arm: uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read with: runner: linux.2xlarge docker-image: executorch-ubuntu-22.04-arm-sdk @@ -433,7 +436,7 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake" # install pybind - bash install_requirements.sh --pybind xnnpack + bash install_executorch.sh --pybind xnnpack # install phi-3-mini requirements bash examples/models/phi-3-mini/install_requirements.sh @@ -460,7 +463,7 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake" # install pybind - bash install_requirements.sh --pybind xnnpack + bash install_executorch.sh --pybind xnnpack # install llama requirements bash examples/models/llama/install_requirements.sh @@ -487,7 +490,7 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake" # install pybind - bash install_requirements.sh --pybind xnnpack + bash install_executorch.sh --pybind xnnpack # install llama requirements bash examples/models/llama/install_requirements.sh @@ -514,7 +517,7 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake" # install pybind - bash install_requirements.sh --pybind xnnpack + bash install_executorch.sh --pybind xnnpack # install llama requirements bash examples/models/llama/install_requirements.sh diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 90bd0eb6ef..0cbbe6f643 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -132,6 +132,9 @@ jobs: test-arm-backend-delegation: name: test-arm-backend-delegation uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read with: runner: linux.2xlarge docker-image: executorch-ubuntu-22.04-arm-sdk @@ -159,6 +162,9 @@ jobs: test-arm-reference-delegation: name: test-arm-reference-delegation 
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read with: runner: linux.2xlarge docker-image: executorch-ubuntu-22.04-arm-sdk diff --git a/backends/apple/mps/setup.md b/backends/apple/mps/setup.md index 7cd4c240a4..8234487653 100644 --- a/backends/apple/mps/setup.md +++ b/backends/apple/mps/setup.md @@ -97,7 +97,7 @@ I 00:00:00.122615 executorch:mps_executor_runner.mm:501] Model verified successf ### [Optional] Run the generated model directly using pybind 1. Make sure `pybind` MPS support was installed: ```bash -./install_requirements.sh --pybind mps +./install_executorch.sh --pybind mps ``` 2. Run the `mps_example` script to trace the model and run it directly from python: ```bash diff --git a/backends/cadence/build_cadence_fusionG3.sh b/backends/cadence/build_cadence_fusionG3.sh index 7a4dd68fb3..7cdcc38c04 100644 --- a/backends/cadence/build_cadence_fusionG3.sh +++ b/backends/cadence/build_cadence_fusionG3.sh @@ -12,7 +12,7 @@ unset XTENSA_CORE export XTENSA_CORE=FCV_FG3GP git submodule sync git submodule update --init -./install_requirements.sh +./install_executorch.sh rm -rf cmake-out diff --git a/backends/cadence/build_cadence_hifi4.sh b/backends/cadence/build_cadence_hifi4.sh index 28a3812752..603f3b85f7 100644 --- a/backends/cadence/build_cadence_hifi4.sh +++ b/backends/cadence/build_cadence_hifi4.sh @@ -12,7 +12,7 @@ unset XTENSA_CORE export XTENSA_CORE=nxp_rt600_RI23_11_newlib git submodule sync git submodule update --init -./install_requirements.sh +./install_executorch.sh rm -rf cmake-out diff --git a/backends/cadence/fusion_g3/operators/op_dequantize.cpp b/backends/cadence/fusion_g3/operators/op_dequantize.cpp index cff50f2a90..3e0235170b 100644 --- a/backends/cadence/fusion_g3/operators/op_dequantize.cpp +++ b/backends/cadence/fusion_g3/operators/op_dequantize.cpp @@ -67,8 +67,8 @@ void check_dequantize_per_tensor_args( ET_CHECK_MSG( input.scalar_type() == dtype, - "input.scalar_type() %" PRId8 " is not matching dtype argumenta:", - static_cast(input.scalar_type())); + "input.scalar_type() %s is not matching dtype arguments:", + ::executorch::runtime::toString(input.scalar_type())); if (out_dtype.has_value()) { ET_CHECK_MSG( @@ -561,11 +561,12 @@ Tensor& dequantize_per_tensor_out( const Tensor& input, double scale, int64_t zero_point, - int64_t quant_min, - int64_t quant_max, + __ET_UNUSED int64_t quant_min, + __ET_UNUSED int64_t quant_max, ScalarType dtype, - ::executorch::aten::optional out_dtype, Tensor& out) { + constexpr ScalarType out_dtype = ScalarType::Float; + #ifdef OP_ARG_CHECK torch::executor::Error err = resize_tensor(out, input.sizes()); ET_CHECK_MSG( diff --git a/backends/cadence/reference/operators/quantized_conv_out.cpp b/backends/cadence/reference/operators/quantized_conv_out.cpp index 5a7af85809..b18159a0b3 100644 --- a/backends/cadence/reference/operators/quantized_conv_out.cpp +++ b/backends/cadence/reference/operators/quantized_conv_out.cpp @@ -119,7 +119,7 @@ __attribute__((noinline)) void conv2d_nchw_core_generic( if (((_h + d0 * _wh - p0) >= 0) && ((_h + d0 * _wh - p0) < h) && ((_w + d1 * _ww - p1) >= 0) && - ((_w + d1 * _ww - p1 < w))) { + ((_w + d1 * _ww - p1) < w)) { int ioff = (_h + d0 * _wh - p0) * w + (_w + d1 * _ww - p1); int woff = _wh * ww + _ww; diff --git a/backends/qualcomm/_passes/insert_requantize.py b/backends/qualcomm/_passes/insert_requantize.py index 11aad02a0c..83b729f3c4 100644 --- a/backends/qualcomm/_passes/insert_requantize.py +++ 
b/backends/qualcomm/_passes/insert_requantize.py @@ -89,15 +89,9 @@ def _single_output_annotation( requantize_dict = n.meta.pop(QCOM_REQUANTIZE) # {quant_attr: user_node_name_list} group_quant_attr_dict = self._invert_dict(requantize_dict) - # TODO: If users of the node contain output node, - # we replace the node with to_copy op. However, it would - # be problem when the node has multiple to_copy ops - add_output = len(group_quant_attr_dict) == 1 for hashable_quant_attr, user_nodes in group_quant_attr_dict.items(): user_nodes_copy = user_nodes.copy() - if add_output: - user_nodes_copy.append("output") self._insert_to_copy(gm, n, dict(hashable_quant_attr), user_nodes_copy) def _insert(self, graph_module: torch.fx.GraphModule) -> torch.fx.GraphModule: diff --git a/backends/qualcomm/quantizer/custom_annotation.py b/backends/qualcomm/quantizer/custom_annotation.py index 6167df64b9..ba397273b6 100644 --- a/backends/qualcomm/quantizer/custom_annotation.py +++ b/backends/qualcomm/quantizer/custom_annotation.py @@ -14,17 +14,80 @@ QuantizationConfig, ) from executorch.exir.dialects._ops import ops as exir_ops -from torch.ao.quantization.observer import MinMaxObserver +from torch.ao.quantization.observer import FixedQParamsObserver, MinMaxObserver from torch.ao.quantization.quantizer import ( QuantizationAnnotation, + QuantizationSpec, SharedQuantizationSpec, ) from torch.fx import Node -def annotate_matmul_16a8w( # noqa: C901 - gm: torch.fx.GraphModule, traverse_input1=True -) -> None: +def annotate_linear_16a8w_in_affine_layer(gm: torch.fx.GraphModule) -> None: + def annotate_conv2d(node: Node, quantization_config: QuantizationConfig) -> None: + input_qspec_map = {} + input_act = node.args[0] + input_spec = quantization_config.input_activation + input_qspec_map[input_act] = input_spec + + weight = node.args[1] + input_qspec_map[weight] = quantization_config.weight + + node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=quantization_config.output_activation, + _annotated=True, + ) + + quantization_config_16a8w_per_channel = get_ptq_per_channel_quant_config( + torch.uint16, weight_dtype=torch.int8, act_observer=MinMaxObserver + ) + for node in gm.graph.nodes: + if node.op == "call_function" and node.target == torch.ops.aten.conv2d.default: + if "nn_module_stack" in node.meta: + module_values_list = list(node.meta["nn_module_stack"].values()) + full_qualified_name = module_values_list[-1][0] + if full_qualified_name == "output.conv": + annotate_conv2d( + node, quantization_config=quantization_config_16a8w_per_channel + ) + + +def annotate_prefill_kv_output(gm: torch.fx.GraphModule, kv_quant_attrs: dict): + for node in gm.graph.nodes: + if node.op == "output": + for index, prefill_output in enumerate(node.args[0]): + kv_quant_attr = kv_quant_attrs[index] + fixed_observer = FixedQParamsObserver.with_args( + scale=kv_quant_attr[0], + zero_point=kv_quant_attr[1], + quant_min=kv_quant_attr[2], + quant_max=kv_quant_attr[3], + dtype=kv_quant_attr[4], + qscheme=torch.torch.per_tensor_affine, + ) + + fixed_output_spec = QuantizationSpec( + quant_min=kv_quant_attr[2], + quant_max=kv_quant_attr[3], + dtype=kv_quant_attr[4], + ch_axis=0, + observer_or_fake_quant_ctr=fixed_observer, + ) + + input_qspec_map = {} + for input in prefill_output.args: + if isinstance(input, Node): + input_qspec_map[input] = fixed_output_spec + + prefill_output.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + 
output_qspec=fixed_output_spec, + _annotated=True, + ) + + +def annotate_matmul_16a8w(gm: torch.fx.GraphModule) -> None: # noqa: C901 """ This function is specific for matmul op 16a8w. For k, we will tag such as the below, and @@ -142,8 +205,7 @@ def annotate_matmul_input1(node: Node): for node in gm.graph.nodes: if node.op == "call_function" and node.target == torch.ops.aten.matmul.default: annotate_matmul(node, quantization_config_16a8w) - if traverse_input1: - annotate_matmul_input1(node.args[1]) + annotate_matmul_input1(node.args[1]) def custom_annotate_llama_matmul_16a8w(gm: torch.fx.GraphModule) -> None: # noqa: C901 diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 73ca1820f3..55596cf038 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -3280,7 +3280,7 @@ def test_stories_single_llama(self): cmds = [ "python", - f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama2/llama.py", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py", "--artifact", self.artifact_dir, "--build_folder", @@ -3307,6 +3307,8 @@ def test_stories_single_llama(self): "16a4w", "--temperature", "0", + "--llama_model", + "stories110m", ] if self.host: cmds.extend(["--host", self.host]) diff --git a/backends/vulkan/docs/android_demo.md b/backends/vulkan/docs/android_demo.md index ce23eb989f..7eab1c21f8 100644 --- a/backends/vulkan/docs/android_demo.md +++ b/backends/vulkan/docs/android_demo.md @@ -81,7 +81,7 @@ First, build and install ExecuTorch libraries, then build the LLaMA runner binary using the Android NDK toolchain. ```shell -./install_requirements.sh --clean +./install_executorch.sh --clean (mkdir cmake-android-out && \ cmake . -DCMAKE_INSTALL_PREFIX=cmake-android-out \ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl index a5a2097cd5..f72c487fa7 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl @@ -33,7 +33,9 @@ ${layout_declare_ubo(8, "float", "out_min", "float", "out_max")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; // shared memory to hold calculated positions, this would reduce register usage thus improving performance. -shared ivec2 pos_shared[gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z * TILE_SIZE * TILE_SIZE]; +// 64 is the number of threads in the local wg +$num_shared = 64 * TILE_SIZE * TILE_SIZE +shared ivec2 pos_shared[${num_shared}]; /* * Computes a 2D pointwise convolution of an NxN output tile. 
Calculating an diff --git a/backends/vulkan/test/op_tests/linear_weight_int4_test.cpp b/backends/vulkan/test/op_tests/linear_weight_int4_test.cpp index 63ebb96cfa..66a585844c 100644 --- a/backends/vulkan/test/op_tests/linear_weight_int4_test.cpp +++ b/backends/vulkan/test/op_tests/linear_weight_int4_test.cpp @@ -30,16 +30,38 @@ at::Tensor linear_weight_int4_reference_impl( const size_t ndim = original_x_size.size(); const int64_t out_features = weights_4x2.size(0); const at::Tensor x_flattened = x.reshape({-1, original_x_size[ndim - 1]}); - const at::Tensor packed_weights = - at::_convert_weight_to_int4pack(weights_4x2, inner_k_tiles); - at::Tensor out = at::_weight_int4pack_mm( - x_flattened, packed_weights, groupsize, scales_and_zeros); + at::Tensor out = at::_weight_int4pack_mm_for_cpu( + x_flattened, weights_4x2, groupsize, scales_and_zeros); std::vector out_shape( original_x_size.begin(), original_x_size.end()); out_shape.at(ndim - 1) = out_features; return out.reshape(out_shape); } +at::Tensor unpack_weights_4x2(const at::Tensor& weights_4x2) { + std::vector weights_shape(weights_4x2.sizes().vec()); + weights_shape[1] *= 2; + + at::Tensor weights_unpacked = + at::empty(weights_shape, at::device(at::kCPU).dtype(at::kInt)); + + const int64_t N = weights_unpacked.size(0); + const int64_t K = weights_unpacked.size(1); + + for (int n = 0; n < N; n++) { + for (int k = 0; k < K; k += 2) { + const uint8_t packed_val = weights_4x2[n][k / 2].item().to(); + const uint8_t second_val = packed_val & 0x0F; + const uint8_t first_val = (packed_val & 0xF0) >> 4; + + weights_unpacked[n][k] = int(first_val); + weights_unpacked[n][k + 1] = int(second_val); + } + } + + return weights_unpacked; +} + at::Tensor dequantize_and_linear( const at::Tensor& x, const at::Tensor& weights_4x2, @@ -91,13 +113,18 @@ void test_reference_linear_int4( at::Tensor x = at::rand({B, M, K}, at::device(at::kCPU).dtype(at::kFloat)); at::Tensor weights_4x2 = at::randint(0, 256, {N, K / 2}, at::device(at::kCPU).dtype(at::kByte)); + at::Tensor weights_int = unpack_weights_4x2(weights_4x2); const int k_groups = K / group_size; at::Tensor scales_and_zeros = at::rand({k_groups, N, 2}, at::device(at::kCPU).dtype(at::kFloat)); at::Tensor out = linear_weight_int4_reference_impl( - x, weights_4x2, group_size, scales_and_zeros, inner_k_tiles); + x, + at::_convert_weight_to_int4pack_for_cpu(weights_int, group_size), + group_size, + scales_and_zeros, + inner_k_tiles); at::Tensor out_ref = dequantize_and_linear( x, weights_4x2, group_size, scales_and_zeros, inner_k_tiles); diff --git a/backends/xnnpack/README.md b/backends/xnnpack/README.md index 2184257b79..967a852599 100644 --- a/backends/xnnpack/README.md +++ b/backends/xnnpack/README.md @@ -98,7 +98,7 @@ After exporting the XNNPACK Delegated model, we can now try running it with exam cd executorch # Get a clean cmake-out directory -./install_requirements.sh --clean +./install_executorch.sh --clean mkdir cmake-out # Configure cmake diff --git a/build/test_ios.sh b/build/test_ios.sh index 150a974cc1..cdc02098ce 100755 --- a/build/test_ios.sh +++ b/build/test_ios.sh @@ -63,7 +63,7 @@ say "Installing Requirements" pip install --upgrade cmake pip setuptools wheel zstd -./install_requirements.sh --pybind coreml mps xnnpack +./install_executorch.sh --pybind coreml mps xnnpack export PATH="$(realpath third-party/flatbuffers/cmake-out):$PATH" ./build/install_flatc.sh diff --git a/docs/README.md b/docs/README.md index c65bd8474e..dd1fded5aa 100644 --- a/docs/README.md +++ b/docs/README.md @@ -65,7 
+65,7 @@ To build the documentation locally: 1. Run: ```bash - bash install_requirements.sh + bash install_executorch.sh ``` 1. Go to the `docs/` directory. diff --git a/docs/source/apple-runtime.md b/docs/source/apple-runtime.md index bfda12db21..fe744add52 100644 --- a/docs/source/apple-runtime.md +++ b/docs/source/apple-runtime.md @@ -109,7 +109,7 @@ python3 -m venv .venv && source .venv/bin/activate && pip install --upgrade pip 4. Install the required dependencies, including those needed for the backends like [Core ML](build-run-coreml.md) or [MPS](build-run-mps.md), if you plan to build them as well: ```bash -./install_requirements.sh --pybind coreml mps xnnpack +./install_executorch.sh --pybind coreml mps xnnpack # Optional dependencies for Core ML backend. ./backends/apple/coreml/scripts/install_requirements.sh diff --git a/docs/source/build-run-xtensa.md b/docs/source/build-run-xtensa.md index bc90ee5292..6097c9095a 100644 --- a/docs/source/build-run-xtensa.md +++ b/docs/source/build-run-xtensa.md @@ -162,7 +162,7 @@ In order to run the CMake build, you need the path to the following: ```bash cd executorch -./install_requirements.sh --clean +./install_executorch.sh --clean mkdir cmake-out # prebuild and install executorch library cmake -DCMAKE_TOOLCHAIN_FILE=/backends/cadence/cadence.cmake \ diff --git a/docs/source/getting-started-setup.md b/docs/source/getting-started-setup.md index e21a73195c..fe0b042e60 100644 --- a/docs/source/getting-started-setup.md +++ b/docs/source/getting-started-setup.md @@ -92,23 +92,23 @@ Alternatively, if you would like to experiment with ExecuTorch quickly and easil # Install ExecuTorch pip package and its dependencies, as well as # development tools like CMake. # If developing on a Mac, make sure to install the Xcode Command Line Tools first. - ./install_requirements.sh + ./install_executorch.sh ``` - Use the [`--pybind` flag](https://github.com/pytorch/executorch/blob/main/install_requirements.sh#L26-L29) to install with pybindings and dependencies for other backends. + Use the [`--pybind` flag](https://github.com/pytorch/executorch/blob/main/install_executorch.sh#L26-L29) to install with pybindings and dependencies for other backends. ```bash - ./install_requirements.sh --pybind + ./install_executorch.sh --pybind # Example: pybindings with CoreML *only* - ./install_requirements.sh --pybind coreml + ./install_executorch.sh --pybind coreml # Example: pybinds with CoreML *and* XNNPACK - ./install_requirements.sh --pybind coreml xnnpack + ./install_executorch.sh --pybind coreml xnnpack ``` - By default, `./install_requirements.sh` command installs pybindings for XNNPACK. To disable any pybindings altogether: + By default, `./install_executorch.sh` command installs pybindings for XNNPACK. To disable any pybindings altogether: ```bash - ./install_requirements.sh --pybind off + ./install_executorch.sh --pybind off ``` After setting up your environment, you are ready to convert your PyTorch programs @@ -125,7 +125,7 @@ to ExecuTorch. > > ```bash > # From the root of the executorch repo: -> ./install_requirements.sh --clean +> ./install_executorch.sh --clean > git submodule sync > git submodule update --init > ``` @@ -208,7 +208,7 @@ The ExecuTorch repo uses CMake to build its C++ code. Here, we'll configure it t ```bash # Clean and configure the CMake build system. Compiled programs will # appear in the executorch/cmake-out directory we create here. 
- ./install_requirements.sh --clean + ./install_executorch.sh --clean (mkdir cmake-out && cd cmake-out && cmake ..) # Build the executor_runner target @@ -226,7 +226,7 @@ The ExecuTorch repo uses CMake to build its C++ code. Here, we'll configure it t > > ```bash > # From the root of the executorch repo: -> ./install_requirements.sh --clean +> ./install_executorch.sh --clean > git submodule sync > git submodule update --init > ``` diff --git a/docs/source/llm/getting-started.md b/docs/source/llm/getting-started.md index 9f88d7de36..f0de7cc9c9 100644 --- a/docs/source/llm/getting-started.md +++ b/docs/source/llm/getting-started.md @@ -52,7 +52,7 @@ git submodule update --init # Create a conda environment and install requirements. conda create -yn executorch python=3.10.0 conda activate executorch -./install_requirements.sh +./install_executorch.sh cd ../.. ``` @@ -83,7 +83,7 @@ cd third-party/executorch git submodule update --init # Install requirements. -PYTHON_EXECUTABLE=python ./install_requirements.sh +PYTHON_EXECUTABLE=python ./install_executorch.sh cd ../.. ``` @@ -396,7 +396,7 @@ At this point, the working directory should contain the following files: If all of these are present, you can now build and run: ```bash -./install_requirements.sh --clean +./install_executorch.sh --clean (mkdir cmake-out && cd cmake-out && cmake ..) cmake --build cmake-out -j10 ./cmake-out/nanogpt_runner diff --git a/docs/source/runtime-build-and-cross-compilation.md b/docs/source/runtime-build-and-cross-compilation.md index f30d2d28d1..3574b76b6d 100644 --- a/docs/source/runtime-build-and-cross-compilation.md +++ b/docs/source/runtime-build-and-cross-compilation.md @@ -45,7 +45,7 @@ cd executorch # Clean and configure the CMake build system. It's good practice to do this # whenever cloning or pulling the upstream repo. -./install_requirements.sh --clean +./install_executorch.sh --clean (mkdir cmake-out && cd cmake-out && cmake ..) ``` @@ -122,7 +122,7 @@ Following are instruction on how to perform cross compilation for Android and iO Assuming Android NDK is available, run: ```bash # Run the following lines from the `executorch/` folder -./install_requirements.sh --clean +./install_executorch.sh --clean mkdir cmake-android-out && cd cmake-android-out # point -DCMAKE_TOOLCHAIN_FILE to the location where ndk is installed diff --git a/docs/source/tutorial-xnnpack-delegate-lowering.md b/docs/source/tutorial-xnnpack-delegate-lowering.md index f4579e2cce..c81f61878c 100644 --- a/docs/source/tutorial-xnnpack-delegate-lowering.md +++ b/docs/source/tutorial-xnnpack-delegate-lowering.md @@ -147,7 +147,7 @@ After exporting the XNNPACK Delegated model, we can now try running it with exam cd executorch # Get a clean cmake-out directory -./install_requirements.sh --clean +./install_executorch.sh --clean mkdir cmake-out # Configure cmake diff --git a/examples/cadence/operators/facto_util.py b/examples/cadence/operators/facto_util.py new file mode 100644 index 0000000000..e708796c7b --- /dev/null +++ b/examples/cadence/operators/facto_util.py @@ -0,0 +1,117 @@ +# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
+ +# pyre-strict + +import copy +from typing import List, OrderedDict, Tuple + +import torch +from inputgen.argtuple.gen import ArgumentTupleGenerator +from inputgen.specs.model import ConstraintProducer as cp +from inputgen.utils.random_manager import random_manager +from inputgen.variable.type import ScalarDtype +from specdb.db import SpecDictDB + +# seed to generate identical cases every run to reproduce from bisect +random_manager.seed(1729) + + +def apply_tensor_contraints(op_name: str, tensor_constraints: list[object]) -> None: + match op_name: + case ( + "sigmoid.default" + | "_softmax.default" + | "rsqrt.default" + | "exp.default" + | "mul.Tensor" + | "div.Tensor" + ): + tensor_constraints.extend( + [ + cp.Dtype.In(lambda deps: [torch.float]), + cp.Size.Le(lambda deps, r, d: 2), + cp.Rank.Le(lambda deps: 2), + ] + ) + case ( + "add.Tensor" + | "sub.Tensor" + | "add.Scalar" + | "sub.Scalar" + | "mul.Scalar" + | "div.Scalar" + ): + tensor_constraints.extend( + [ + cp.Dtype.In(lambda deps: [torch.float, torch.int32]), + cp.Size.Le(lambda deps, r, d: 2), + cp.Rank.Le(lambda deps: 2), + ] + ) + case "native_layer_norm.default": + tensor_constraints.extend( + [ + cp.Dtype.In(lambda deps: [torch.float, torch.int32]), + cp.Size.Le(lambda deps, r, d: 2**4), + cp.Rank.Le(lambda deps: 2**4), + ] + ) + case _: + tensor_constraints.extend( + [ + cp.Dtype.In(lambda deps: [torch.float, torch.int32]), + cp.Size.Le(lambda deps, r, d: 2), + cp.Rank.Le(lambda deps: 2), + ] + ) + tensor_constraints.extend( + [ + cp.Value.Ge(lambda deps, dtype, struct: -(2**8)), + cp.Value.Le(lambda deps, dtype, struct: 2**8), + cp.Rank.Ge(lambda deps: 1), + cp.Size.Ge(lambda deps, r, d: 1), + ] + ) + + +def apply_scalar_contraints(op_name: str) -> list[ScalarDtype]: + match op_name: + case "add.Scalar" | "sub.Scalar" | "mul.Scalar" | "div.Scalar": + return [ScalarDtype.int] + case _: + return [ScalarDtype.float, ScalarDtype.int] + + +def facto_testcase_gen(op_name: str) -> List[Tuple[List[str], OrderedDict[str, str]]]: + # minimal example to test add.Tensor using FACTO + spec = SpecDictDB[op_name] + tensor_constraints = [] + # common tensor constraints + apply_tensor_contraints(op_name, tensor_constraints) + + for index, in_spec in enumerate(copy.deepcopy(spec.inspec)): + if in_spec.type.is_scalar(): + if in_spec.name != "alpha": + spec.inspec[index].constraints.extend( + [ + cp.Dtype.In(lambda deps: apply_scalar_contraints(op_name)), + cp.Value.Ge(lambda deps, dtype: -(2**8)), + cp.Value.Le(lambda deps, dtype: 2**2), + cp.Size.Ge(lambda deps, r, d: 1), + cp.Size.Le(lambda deps, r, d: 2**2), + ] + ) + else: + spec.inspec[index].constraints.extend( + [ + cp.Value.Gt(lambda deps, dtype: 0), + cp.Value.Le(lambda deps, dtype: 2), + ] + ) + elif in_spec.type.is_tensor(): + spec.inspec[index].constraints.extend(tensor_constraints) + + return [ + (posargs, inkwargs) + for posargs, inkwargs, _ in ArgumentTupleGenerator(spec).gen() + ] diff --git a/examples/cadence/operators/targets.bzl b/examples/cadence/operators/targets.bzl index e1fbeb9fdf..32dc9061b5 100644 --- a/examples/cadence/operators/targets.bzl +++ b/examples/cadence/operators/targets.bzl @@ -5,9 +5,11 @@ # LICENSE file in the root directory of this source tree. 
load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") +load("@fbcode_macros//build_defs:python_library.bzl", "python_library") TESTS_LIST = [ "add_op", + "g3_ops", "quantized_conv1d_op", "quantized_linear_op", ] @@ -16,6 +18,19 @@ def define_common_targets(): for op in TESTS_LIST: _define_test_target(op) + python_library( + name = "facto_util", + srcs = [ + "facto_util.py", + ], + typing = True, + deps = [ + "fbcode//caffe2:torch", + "fbcode//pytorch/facto:inputgen", + "fbcode//pytorch/facto:specdb", + ], + ) + def _define_test_target(test_name): file_name = "test_{}".format(test_name) @@ -32,5 +47,6 @@ def _define_test_target(test_name): "fbcode//executorch/backends/cadence/aot:ops_registrations", "fbcode//executorch/backends/cadence/aot:export_example", "fbcode//executorch/backends/cadence/aot:compiler", + "fbcode//executorch/examples/cadence/operators:facto_util", ], ) diff --git a/examples/cadence/operators/test_g3_ops.py b/examples/cadence/operators/test_g3_ops.py new file mode 100644 index 0000000000..158e13d389 --- /dev/null +++ b/examples/cadence/operators/test_g3_ops.py @@ -0,0 +1,264 @@ +import unittest +from typing import Any, cast, List, OrderedDict, Tuple + +from executorch.examples.cadence.operators import facto_util + +from parameterized import parameterized + +from executorch.backends.cadence.aot.ops_registrations import * # noqa + +import torch +import torch.nn as nn +from executorch.backends.cadence.aot.export_example import export_model + + +class ATenOpTestCases(unittest.TestCase): + def run_and_verify(self, model: nn.Module, inputs: Tuple[Any, ...]) -> None: + model.eval() + export_model( + model, inputs, file_name=self._testMethodName, run_and_compare=False + ) + + # pyre-ignore[16]: Module `parameterized.parameterized` has no attribute `expand`. + @parameterized.expand([*facto_util.facto_testcase_gen("add.Tensor")]) + @torch.no_grad() + def test_g3_add_tensor_out( + self, + posargs: List[str], + inkwargs: OrderedDict[str, str], + ) -> None: + class AddTensor(nn.Module): + def __init__(self, alpha: float): + super().__init__() + self.alpha = alpha + + def forward(self, x: torch.Tensor, y: torch.Tensor): + return torch.add(x, y, alpha=self.alpha) + + model = AddTensor(**inkwargs) + + self.run_and_verify(model, tuple(posargs)) + + # pyre-ignore[16]: Module `parameterized.parameterized` has no attribute `expand`. + @parameterized.expand([*facto_util.facto_testcase_gen("add.Scalar")]) + @torch.no_grad() + def test_aten_add_Scalar_out( + self, + posargs: List[str], + inkwargs: OrderedDict[str, str], + ) -> None: + class AddScalar(nn.Module): + def __init__(self, alpha: float): + super().__init__() + self.alpha = alpha + + def forward(self, x: torch.Tensor, y: float): + return torch.add(x, y, alpha=self.alpha) + + inputs = posargs[:-1] # posargs = [x_tensor, y_scalar, alpha_scalar] + alpha = posargs[-1] + model = AddScalar(alpha) + + self.run_and_verify(model, tuple(inputs)) + + # pyre-ignore[16]: Module `parameterized.parameterized` has no attribute `expand`. 
+ @parameterized.expand([*facto_util.facto_testcase_gen("sub.Tensor")]) + @torch.no_grad() + def test_g3_sub_tensor_out( + self, + posargs: List[str], + inkwargs: OrderedDict[str, str], + ) -> None: + class SubTensor(nn.Module): + def __init__(self, alpha: float): + super().__init__() + self.alpha = alpha + + def forward(self, x: torch.Tensor, y: torch.Tensor): + return torch.sub(x, y, alpha=self.alpha) + + model = SubTensor(**inkwargs) + + self.run_and_verify(model, tuple(posargs)) + + # pyre-ignore[16]: Module `parameterized.parameterized` has no attribute `expand`. + @parameterized.expand([*facto_util.facto_testcase_gen("sub.Scalar")]) + @torch.no_grad() + def test_g3_sub_scalar_out( + self, + posargs: List[str], + inkwargs: OrderedDict[str, str], + ) -> None: + # Tensor-Scalar subtraction + class SubScalar(torch.nn.Module): + def __init__(self, other): + super().__init__() + self.other = other + + def forward(self, x): + return torch.ops.aten.sub.Scalar(x, self.other) + + inputs = posargs[0] # posargs = [x_tensor, y_scalar, alpha_scalar] + model = SubScalar(posargs[1]) + + self.run_and_verify(model, (inputs,)) + + # pyre-ignore[16]: Module `parameterized.parameterized` has no attribute `expand`. + @parameterized.expand([*facto_util.facto_testcase_gen("div.Tensor")]) + @torch.no_grad() + def test_g3_div_tensor_out( + self, + posargs: List[str], + inkwargs: OrderedDict[str, str], + ) -> None: + class DivTensor(nn.Module): + def forward(self, x: torch.Tensor, y: torch.Tensor): + return torch.div(x, y + 1) + + model = DivTensor(**inkwargs) + + self.run_and_verify(model, tuple(posargs)) + + # pyre-ignore[16]: Module `parameterized.parameterized` has no attribute `expand`. + @parameterized.expand([*facto_util.facto_testcase_gen("div.Scalar")]) + @torch.no_grad() + def test_g3_div_scalar_out( + self, + posargs: List[str], + inkwargs: OrderedDict[str, str], + ) -> None: + class DivScalar(nn.Module): + def forward(self, x: torch.Tensor, y: torch.Tensor): + return torch.div(x, y + 1) + + model = DivScalar(**inkwargs) + + self.run_and_verify(model, tuple(posargs)) + + # pyre-ignore[16]: Module `parameterized.parameterized` has no attribute `expand`. + @parameterized.expand([*facto_util.facto_testcase_gen("exp.default")]) + @torch.no_grad() + def test_g3_exp_out( + self, + posargs: List[str], + inkwargs: OrderedDict[str, str], + ) -> None: + class Exp(nn.Module): + def forward(self, x: torch.Tensor): + return torch.exp(x) + + model = Exp(**inkwargs) + + self.run_and_verify(model, tuple(posargs)) + + # pyre-ignore[16]: Module `parameterized.parameterized` has no attribute `expand`. + @parameterized.expand([*facto_util.facto_testcase_gen("mul.Tensor")]) + @torch.no_grad() + def test_g3_mul_tensor_out( + self, + posargs: List[str], + inkwargs: OrderedDict[str, str], + ) -> None: + class MulTensor(nn.Module): + def forward(self, x: torch.Tensor, y: torch.Tensor): + return x * y + + model = MulTensor(**inkwargs) + + self.run_and_verify(model, tuple(posargs)) + + # pyre-ignore[16]: Module `parameterized.parameterized` has no attribute `expand`. + @parameterized.expand([*facto_util.facto_testcase_gen("mul.Scalar")]) + @torch.no_grad() + def test_g3_mul_scalar_out( + self, + posargs: List[str], + inkwargs: OrderedDict[str, str], + ) -> None: + class MulScalar(nn.Module): + def forward(self, x: torch.Tensor, y: torch.Tensor): + return x * y + + model = MulScalar(**inkwargs) + + self.run_and_verify(model, tuple(posargs)) + + # pyre-ignore[16]: Module `parameterized.parameterized` has no attribute `expand`. 
+ @parameterized.expand([*facto_util.facto_testcase_gen("native_layer_norm.default")]) + @torch.no_grad() + def test_g3_native_layer_norm_out( + self, + posargs: List[int], + inkwargs: OrderedDict[str, str], + ) -> None: + inputs, normalized_shape, weight, bias, _ = posargs + model = nn.LayerNorm(normalized_shape, eps=1e-5) + if weight is not None: + weight = cast(torch.Tensor, weight) + model.weight = nn.Parameter(torch.rand_like(weight)) + if bias is not None: + bias = cast(torch.Tensor, bias) + model.bias = nn.Parameter(torch.rand_like(bias)) + + self.run_and_verify(model, (inputs,)) + + # pyre-ignore[16]: Module `parameterized.parameterized` has no attribute `expand`. + @parameterized.expand([*facto_util.facto_testcase_gen("neg.default")]) + @torch.no_grad() + def test_g3_neg_out( + self, + posargs: List[int], + inkwargs: OrderedDict[str, str], + ) -> None: + class Neg(nn.Module): + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.neg(x) + + model = Neg(**inkwargs) + + self.run_and_verify(model, tuple(posargs)) + + # pyre-ignore[16]: Module `parameterized.parameterized` has no attribute `expand`. + @parameterized.expand([*facto_util.facto_testcase_gen("rsqrt.default")]) + @torch.no_grad() + def test_g3_rsqrt_out( + self, + posargs: List[int], + inkwargs: OrderedDict[str, str], + ) -> None: + class Rsqrt(nn.Module): + def forward(self, x: torch.Tensor): + return torch.ops.aten.rsqrt(x) + + model = Rsqrt(**inkwargs) + + self.run_and_verify(model, tuple(posargs)) + + # pyre-ignore[16]: Module `parameterized.parameterized` has no attribute `expand`. + @parameterized.expand([*facto_util.facto_testcase_gen("sigmoid.default")]) + @torch.no_grad() + def test_g3_sigmoid_out( + self, + posargs: List[int], + inkwargs: OrderedDict[str, str], + ) -> None: + model = nn.Sigmoid(**inkwargs) + + self.run_and_verify(model, tuple(posargs)) + + # pyre-ignore[16]: Module `parameterized.parameterized` has no attribute `expand`. + @parameterized.expand([*facto_util.facto_testcase_gen("_softmax.default")]) + @torch.no_grad() + def test_g3__softmax_out( + self, + posargs: List[int], + inkwargs: OrderedDict[str, str], + ) -> None: + inputs, _, _ = posargs + model = nn.Softmax(dim=-1) + + self.run_and_verify(model, (inputs,)) + + +if __name__ == "__main__": + unittest.main() diff --git a/examples/demo-apps/android/ExecuTorchDemo/README.md b/examples/demo-apps/android/ExecuTorchDemo/README.md index 1feb9ca92d..931509891a 100644 --- a/examples/demo-apps/android/ExecuTorchDemo/README.md +++ b/examples/demo-apps/android/ExecuTorchDemo/README.md @@ -70,7 +70,7 @@ export ANDROID_NDK= export ANDROID_ABI=arm64-v8a # Run the following lines from the `executorch/` folder -./install_requirements.sh --clean +./install_executorch.sh --clean mkdir cmake-android-out # Build the core executorch library @@ -114,7 +114,7 @@ export ANDROID_NDK= export ANDROID_ABI=arm64-v8a export QNN_SDK_ROOT= -./install_requirements.sh --clean +./install_executorch.sh --clean mkdir cmake-android-out cmake . 
-DCMAKE_INSTALL_PREFIX=cmake-android-out \ -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \ diff --git a/examples/demo-apps/android/LlamaDemo/docs/delegates/mediatek_README.md b/examples/demo-apps/android/LlamaDemo/docs/delegates/mediatek_README.md index b80ea51646..6351640dcc 100644 --- a/examples/demo-apps/android/LlamaDemo/docs/delegates/mediatek_README.md +++ b/examples/demo-apps/android/LlamaDemo/docs/delegates/mediatek_README.md @@ -36,7 +36,7 @@ git submodule update --init ``` Install dependencies ``` -./install_requirements.sh +./install_executorch.sh ``` ## Setup Environment Variables ### Download Buck2 and make executable diff --git a/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md b/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md index 7790f66923..92afe613f7 100644 --- a/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md +++ b/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md @@ -34,7 +34,7 @@ git submodule update --init ``` Install dependencies ``` -./install_requirements.sh +./install_executorch.sh ``` ## Setup QNN diff --git a/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md b/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md index 087bd24260..a2ac04ae93 100644 --- a/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md +++ b/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md @@ -36,12 +36,12 @@ git submodule update --init ``` Install dependencies ``` -./install_requirements.sh +./install_executorch.sh ``` Optional: Use the --pybind flag to install with pybindings. ``` -./install_requirements.sh --pybind xnnpack +./install_executorch.sh --pybind xnnpack ``` diff --git a/examples/demo-apps/apple_ios/ExecuTorchDemo/README.md b/examples/demo-apps/apple_ios/ExecuTorchDemo/README.md index 4a4e682a4a..844c83d220 100644 --- a/examples/demo-apps/apple_ios/ExecuTorchDemo/README.md +++ b/examples/demo-apps/apple_ios/ExecuTorchDemo/README.md @@ -51,7 +51,7 @@ python3 -m venv .venv && source .venv/bin/activate pip install --upgrade cmake pip setuptools wheel -./install_requirements.sh --pybind coreml mps xnnpack +./install_executorch.sh --pybind coreml mps xnnpack ``` ### 4. Backend Dependencies diff --git a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md index eb3c244dee..bfe66bbd4e 100644 --- a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md +++ b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md @@ -33,7 +33,7 @@ git submodule update --init Install dependencies ``` -./install_requirements.sh +./install_executorch.sh ``` ## Prepare Models diff --git a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md index 1e03993c94..b357628042 100644 --- a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md +++ b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md @@ -32,11 +32,11 @@ git submodule update --init Install dependencies ``` -./install_requirements.sh +./install_executorch.sh ``` Optional: Use the --pybind flag to install with pybindings. ``` -./install_requirements.sh --pybind xnnpack +./install_executorch.sh --pybind xnnpack ``` ## Prepare Models In this demo app, we support text-only inference with up-to-date Llama models and image reasoning inference with LLaVA 1.5. 
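For orientation: the demo-app README hunks above and below all swap `./install_requirements.sh` for the renamed `./install_executorch.sh`. A minimal consolidated sketch of that setup flow, assembled only from commands that already appear in this patch (the `--pybind xnnpack` variant and the Llama requirements script are only needed for the Llama demos):

```bash
# From the root of the executorch checkout: sync submodules first.
git submodule sync && git submodule update --init

# Renamed entry point (formerly install_requirements.sh); the --pybind flag is optional.
./install_executorch.sh --pybind xnnpack

# Llama demos additionally install their own requirements.
bash examples/models/llama/install_requirements.sh

# Before reconfiguring a native build, reset the CMake output directory.
./install_executorch.sh --clean
```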
diff --git a/examples/demo-apps/react-native/rnllama/README.md b/examples/demo-apps/react-native/rnllama/README.md index 76c44a6621..33c607d635 100644 --- a/examples/demo-apps/react-native/rnllama/README.md +++ b/examples/demo-apps/react-native/rnllama/README.md @@ -26,7 +26,7 @@ A React Native mobile application for running LLaMA language models using ExecuT 3. Pull submodules: `git submodule sync && git submodule update --init` -4. Install dependencies: `./install_requirements.sh --pybind xnnpack && ./examples/models/llama/install_requirements.sh` +4. Install dependencies: `./install_executorch.sh --pybind xnnpack && ./examples/models/llama/install_requirements.sh` 5. Follow the instructions in the [README](https://github.com/pytorch/executorch/blob/main/examples/models/llama/README.md#option-a-download-and-export-llama32-1b3b-model) to export a model as `.pte` @@ -40,4 +40,4 @@ A React Native mobile application for running LLaMA language models using ExecuT 10. Select the model and tokenizer in the app to start chatting: -[![rnllama]](https://github.com/user-attachments/assets/b339f1ec-8b80-41f0-b3f6-ded6698ac926) \ No newline at end of file +[![rnllama]](https://github.com/user-attachments/assets/b339f1ec-8b80-41f0-b3f6-ded6698ac926) diff --git a/examples/devtools/build_example_runner.sh b/examples/devtools/build_example_runner.sh index 693996940d..b6a14bcfb1 100755 --- a/examples/devtools/build_example_runner.sh +++ b/examples/devtools/build_example_runner.sh @@ -37,7 +37,7 @@ done main() { cd "${EXECUTORCH_ROOT}" - ./install_requirements.sh --clean + ./install_executorch.sh --clean if [[ "${BUILD_COREML}" == "ON" ]]; then cmake -DCMAKE_INSTALL_PREFIX=cmake-out \ diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md index cf9553c1c6..7a8838fb01 100644 --- a/examples/models/llama/README.md +++ b/examples/models/llama/README.md @@ -148,7 +148,7 @@ Llama 3 8B performance was measured on the Samsung Galaxy S22, S24, and OnePlus ## Step 1: Setup > :warning: **double check your python environment**: make sure `conda activate ` is run before all the bash and python scripts. -1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. For installation run `./install_requirements.sh --pybind xnnpack` +1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. For installation run `./install_executorch.sh --pybind xnnpack` 2. Run `examples/models/llama/install_requirements.sh` to install a few dependencies. @@ -440,8 +440,8 @@ This example tries to reuse the Python code, with minimal modifications to make ``` git clean -xfd pip uninstall executorch -./install_requirements.sh --clean -./install_requirements.sh --pybind xnnpack +./install_executorch.sh --clean +./install_executorch.sh --pybind xnnpack ``` - If you encounter `pthread` related issues during link time, add `pthread` in `target_link_libraries` in `CMakeLists.txt` - On Mac, if there is linking error in Step 4 with error message like diff --git a/examples/models/llava/README.md b/examples/models/llava/README.md index ad2f3f3dc9..d0dc71c0a8 100644 --- a/examples/models/llava/README.md +++ b/examples/models/llava/README.md @@ -37,7 +37,7 @@ application to test things out on device. Run the following command to generate `llava.pte`, `tokenizer.bin` and an image tensor (serialized in TorchScript) `image.pt`. 
-Prerequisite: run `install_requirements.sh` to install ExecuTorch and run +Prerequisite: run `install_executorch.sh` to install ExecuTorch and run `examples/models/llava/install_requirements.sh` to install dependencies. ```bash diff --git a/examples/models/phi-3-mini-lora/README.md b/examples/models/phi-3-mini-lora/README.md index 8e4b242807..2b7cc0ba40 100644 --- a/examples/models/phi-3-mini-lora/README.md +++ b/examples/models/phi-3-mini-lora/README.md @@ -5,7 +5,7 @@ To see how you can use the model exported for training in a fully involved finet ## Instructions ### Step 1: [Optional] Install ExecuTorch dependencies -`./install_requirements.sh` in ExecuTorch root directory. +`./install_executorch.sh` in ExecuTorch root directory. ### Step 2: Install Requirements - `./examples/models/phi-3-mini-lora/install_requirements.sh` @@ -19,7 +19,7 @@ python export_model.py 2. Run the inference model using an example runtime. For more detailed steps on this, check out [Build & Run](https://pytorch.org/executorch/stable/getting-started-setup.html#build-run). ``` # Clean and configure the CMake build system. Compiled programs will appear in the executorch/cmake-out directory we create here. -./install_requirements.sh --clean +./install_executorch.sh --clean (mkdir cmake-out && cd cmake-out && cmake ..) # Build the executor_runner target diff --git a/examples/models/phi-3-mini/README.md b/examples/models/phi-3-mini/README.md index e5a86c4177..ba878d42a3 100644 --- a/examples/models/phi-3-mini/README.md +++ b/examples/models/phi-3-mini/README.md @@ -3,7 +3,7 @@ This example demonstrates how to run a [Phi-3-mini](https://huggingface.co/micro # Instructions ## Step 1: Setup -1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. For installation run `./install_requirements.sh --pybind xnnpack` +1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. For installation run `./install_executorch.sh --pybind xnnpack` 2. Currently, we support transformers v4.44.2. Install transformers with the following command: ``` pip uninstall -y transformers ; pip install transformers==4.44.2 diff --git a/examples/portable/README.md b/examples/portable/README.md index e469df1510..a6658197da 100644 --- a/examples/portable/README.md +++ b/examples/portable/README.md @@ -45,7 +45,7 @@ Use `-h` (or `--help`) to see all the supported models. ```bash # Build the tool from the top-level `executorch` directory. -./install_requirements.sh --clean +./install_executorch.sh --clean (mkdir cmake-out \ && cd cmake-out \ && cmake -DEXECUTORCH_PAL_DEFAULT=posix ..) 
\ diff --git a/examples/qualcomm/CMakeLists.txt b/examples/qualcomm/CMakeLists.txt index a8e16bb5c9..55969f937e 100644 --- a/examples/qualcomm/CMakeLists.txt +++ b/examples/qualcomm/CMakeLists.txt @@ -84,11 +84,8 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag}) # build qnn_executor_runner add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/executor_runner) -# build qnn_llama_runner for llama2 -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/oss_scripts/llama2) - -# build qnn_llama_runner for llama3.2 -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/oss_scripts/llama3_2) +# build qnn_llama_runner for llama +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/oss_scripts/llama) # build qaihub_llama2_7b_runner and qaihub_llama3_8b_runner add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/qaihub_scripts/llama) diff --git a/examples/qualcomm/README.md b/examples/qualcomm/README.md index ae953be773..64a3cb28f1 100644 --- a/examples/qualcomm/README.md +++ b/examples/qualcomm/README.md @@ -4,10 +4,10 @@ This directory contains examples for some AI models. We have seperated the example scripts into the following subfolders, please refer to [README.md](../../backends/qualcomm/README.md) for the example scripts' directory structure: -1. executor_runner: This folder contains a general executor runner capable of running most of the models. As a rule of thumb, if a model does not have its own customized runner, execute the model using [executor_runner](./executor_runner/qnn_executor_runner.cpp). On the other hand, if a model has its own runner, such as [llama2](./oss_scripts/llama2/qnn_llama_runner.cpp), use the customized runner to execute the model. Customized runner should be located under the same folder as the model's python script. +1. executor_runner: This folder contains a general executor runner capable of running most of the models. As a rule of thumb, if a model does not have its own customized runner, execute the model using [executor_runner](./executor_runner/qnn_executor_runner.cpp). On the other hand, if a model has its own runner, such as [llama](./oss_scripts/llama/qnn_llama_runner.cpp), use the customized runner to execute the model. Customized runner should be located under the same folder as the model's python script. 2. oss_scripts: OSS stands for Open Source Software. This folder contains python scripts for open source models. Some models under this folder might also have their own customized runner. - For example, [llama2](./oss_scripts/llama2/qnn_llama_runner.cpp) contains not only the python scripts to prepare the model but also a customized runner for executing the model. + For example, [llama](./oss_scripts/llama/qnn_llama_runner.cpp) contains not only the python scripts to prepare the model but also a customized runner for executing the model. 3. qaihub_scripts: QAIHub stands for [Qualcomm AI Hub](https://aihub.qualcomm.com/). On QAIHub, users can find pre-compiled context binaries, a format used by QNN to save its models. This provides users with a new option for model deployment. Different from oss_scripts & scripts, which the example scripts are converting a model from nn.Module to ExecuTorch .pte files, qaihub_scripts provides example scripts for converting pre-compiled context binaries to ExecuTorch .pte files. Additionaly, users can find customized example runners specific to the QAIHub models for execution. 
For example [qaihub_llama2_7b](./qaihub_scripts/llama2/qaihub_llama2_7b.py) is a script converting context binaries to ExecuTorch .pte files, and [qaihub_llama2_7b_runner](./qaihub_scripts/llama2/qaihub_llama2_7b_runner.cpp) is a customized example runner to execute llama2 .pte files. Please be aware that context-binaries downloaded from QAIHub are tied to a specific QNN SDK version. Before executing the scripts and runner, please ensure that you are using the QNN SDK version that is matching the context binary. Tutorial below will also cover how to check the QNN Version for a context binary. diff --git a/examples/qualcomm/oss_scripts/llama3_2/CMakeLists.txt b/examples/qualcomm/oss_scripts/llama/CMakeLists.txt similarity index 65% rename from examples/qualcomm/oss_scripts/llama3_2/CMakeLists.txt rename to examples/qualcomm/oss_scripts/llama/CMakeLists.txt index 93b35a697c..c92711d9eb 100644 --- a/examples/qualcomm/oss_scripts/llama3_2/CMakeLists.txt +++ b/examples/qualcomm/oss_scripts/llama/CMakeLists.txt @@ -18,38 +18,35 @@ target_link_libraries( ) target_link_options_shared_lib(custom_ops) -# preprocess qnn runner src files for llama3.2 -set(_llama3_2_runner__srcs ${_llama_runner__srcs}) -list(TRANSFORM _llama3_2_runner__srcs PREPEND "${EXECUTORCH_SOURCE_DIR}/") -list(FILTER _llama3_2_runner__srcs EXCLUDE REGEX ".*(/runner/).*") +# preprocess qnn runner src files for llama +set(_llama_runner__srcs ${_llama_runner__srcs}) +list(TRANSFORM _llama_runner__srcs PREPEND "${EXECUTORCH_SOURCE_DIR}/") +list(FILTER _llama_runner__srcs EXCLUDE REGEX ".*(/runner/).*") list( PREPEND - _llama3_2_runner__srcs - ${CMAKE_CURRENT_LIST_DIR}/qnn_llama3_2_runner.cpp + _llama_runner__srcs + ${CMAKE_CURRENT_LIST_DIR}/qnn_llama_runner.cpp ${CMAKE_CURRENT_LIST_DIR}/runner/runner.cpp ${CMAKE_CURRENT_LIST_DIR}/runner/runner.h ${CMAKE_CURRENT_LIST_DIR}/runner/io_memory.cpp ${CMAKE_CURRENT_LIST_DIR}/runner/io_memory.h ) -list( - APPEND _llama3_2_runner__srcs - ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizer/tiktoken.cpp -) list( APPEND - _llama3_2_runner__srcs + _llama_runner__srcs + ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizer/tiktoken.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../../models/llama/tokenizer/llama_tiktoken.cpp ) -# build qnn llama3.2 1b runner -add_executable(qnn_llama3_2_runner ${_llama3_2_runner__srcs}) +# build qnn llama runner +add_executable(qnn_llama_runner ${_llama_runner__srcs}) target_include_directories( - qnn_llama3_2_runner PUBLIC ${_common_include_directories} + qnn_llama_runner PUBLIC ${_common_include_directories} ) target_link_libraries( - qnn_llama3_2_runner + qnn_llama_runner qnn_executorch_backend executorch_core extension_data_loader @@ -60,8 +57,8 @@ target_link_libraries( custom_ops ) target_compile_options( - qnn_llama3_2_runner PUBLIC ${_common_compile_options} + qnn_llama_runner PUBLIC ${_common_compile_options} ) set_target_properties( - qnn_llama3_2_runner PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'" + qnn_llama_runner PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'" ) diff --git a/examples/qualcomm/oss_scripts/llama/README.md b/examples/qualcomm/oss_scripts/llama/README.md new file mode 100644 index 0000000000..79c20180d6 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/README.md @@ -0,0 +1,70 @@ +# Summary + +## Overview +This file provides you the instructions to run LLAMA model with different parameters via Qualcomm HTP backend. We currently support the following models: + 1. LLAMA2 Stories 110M + 2. LLAMA3.2 1B + 3. 
LLAMA3.2 3B (WIP) +We offer the following modes to execute the model: + +Prefill Mode: This is also known as batch prefill mode, where the model takes in a list of tokens as input and generates the next token along with the key-value (KV) cache for all tokens. This mode is efficient for generating the initial sequence of tokens (usually the user's prompt). + +KV Cache Mode: In KV Cache mode, the model takes in a single previous token and generates the next predicted token along with its KV cache. It is efficient for generating subsequent tokens after the initial prompt. + +Hybrid Mode: Hybrid mode leverages the strengths of both batch prefill and KV cache modes to optimize token generation speed. Initially, it uses prefill mode to efficiently generate the prompt's key-value (KV) cache. Then, the mode switches to KV cache mode, which excels at generating subsequent tokens. + + +## Instructions +### Note +1. For hybrid mode, the export time will be longer and can take up to 1-4 hours to complete, depending on the specific model users are exporting. +2. When exporting a hybrid mode model, memory consumption will be higher. Taking LLAMA3.2 1B as an example, please ensure the device has at least 80 GB of memory and swap space. + + +### Step 1: Setup +1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. +2. Follow the [tutorial](https://pytorch.org/executorch/stable/build-run-qualcomm-ai-engine-direct-backend.html) to build Qualcomm AI Engine Direct Backend. + +### Step 2: Prepare Model + +#### LLAMA2 +Download and prepare stories110M model + +```bash +# tokenizer.model & stories110M.pt: +wget "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt" +wget "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model" + +# tokenizer.bin: +python -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin + +# params.json: +echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json +``` + +#### LLAMA3.2 +Follow the [instructions](https://www.llama.com/) to download models. +At the end of this step, users should have the following files ready: `consolidated.00.pth`, `params.json`, and `tokenizer.model`. + + +### Step3: Run default examples using hybrid mode. +#### LLAMA2 +```bash +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint stories110M.pt --params params.json --tokenizer_model tokenizer.model --tokenizer_bin tokenizer.bin --llama_model stories110m --model_mode hybrid --prefill_seq_len 32 --kv_seq_len 128 --prompt "Once upon a time" +``` + +#### LLAMA3.2 +Default example using hybrid mode. +```bash +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --llama_model llama3_2 --model_mode hybrid --prefill_seq_len 32 --kv_seq_len 128 --prompt "what is 1+1" +``` + +### Additional Configs when running the script +If you would like to compile the model only, we have provided the flag `--compile_only`. 
Taking LLAMA3.2 as an example: +```bash +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --llama_model llama3_2 --model_mode hybrid --prefill_seq_len 32 --kv_seq_len 128 --prompt "what is 1+1" --compile_only +``` + +On the other hand, if you already have a pre-compiled .pte model, you can perform inference by providing the flag `--pre_gen_pte` and specifying the folder that contains the .pte model. Taking LLAMA3.2 as an example: +```bash +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --llama_model llama3_2 --model_mode hybrid --prefill_seq_len 32 --kv_seq_len 128 --prompt "what is 1+1" --pre_gen_pte ${FOLDER_TO_PRE_GEN_PTE} +``` \ No newline at end of file diff --git a/examples/qualcomm/oss_scripts/llama3_2/TARGETS b/examples/qualcomm/oss_scripts/llama/TARGETS similarity index 93% rename from examples/qualcomm/oss_scripts/llama3_2/TARGETS rename to examples/qualcomm/oss_scripts/llama/TARGETS index cab2076f8d..02bf18075c 100644 --- a/examples/qualcomm/oss_scripts/llama3_2/TARGETS +++ b/examples/qualcomm/oss_scripts/llama/TARGETS @@ -8,12 +8,12 @@ oncall("executorch") python_binary( name = "llama", srcs = ["llama.py"], - main_function = "executorch.examples.qualcomm.oss_scripts.llama3_2.llama.main", + main_function = "executorch.examples.qualcomm.oss_scripts.llama.llama.main", preload_deps = [ "//executorch/extension/llm/custom_ops:model_sharding_py", ], deps = [ - "//executorch/examples/qualcomm/oss_scripts/llama2:static_llama", + "//executorch/examples/qualcomm/oss_scripts/llama:static_llama", "//caffe2:torch", "//executorch/extension/pybindings:aten_lib", "//executorch/backends/qualcomm/partition:partition", diff --git a/examples/qualcomm/oss_scripts/llama3_2/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py similarity index 84% rename from examples/qualcomm/oss_scripts/llama3_2/llama.py rename to examples/qualcomm/oss_scripts/llama/llama.py index a18690e941..0af0f55b88 100755 --- a/examples/qualcomm/oss_scripts/llama3_2/llama.py +++ b/examples/qualcomm/oss_scripts/llama/llama.py @@ -22,7 +22,9 @@ from executorch.backends.qualcomm.partition.qnn_partitioner import QnnPartitioner from executorch.backends.qualcomm.quantizer.custom_annotation import ( + annotate_linear_16a8w_in_affine_layer, annotate_matmul_16a8w, + annotate_prefill_kv_output, ) from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype @@ -38,7 +40,8 @@ get_soc_to_chipset_map, update_spill_fill_size, ) -from executorch.examples.qualcomm.oss_scripts.llama2.model.static_llama import ( +from executorch.examples.models.llama.tokenizer.tiktoken import Tokenizer as Tiktoken +from executorch.examples.qualcomm.oss_scripts.llama.model.static_llama import ( LlamaModel, ModelArgs, ) @@ -55,6 +58,9 @@ from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass from executorch.extension.llm.custom_ops import model_sharding from executorch.extension.llm.export.builder import DType +from executorch.extension.llm.tokenizer.tokenizer import ( + Tokenizer as SentencePieceTokenizer, +) from executorch.extension.llm.tokenizer.utils import get_tokenizer from torch.ao.quantization.observer import MinMaxObserver @@ -70,21 +76,28 @@ def _kv_calibrate( example_inputs, user_prompts, module: torch.fx.GraphModule, - 
tokenizer_model_path="tokenizer.model", + tokenizer, max_seq_len=512, ): - sp_model = get_tokenizer(tokenizer_model_path) _, atten_mask, _, k_caches, v_caches = example_inputs # TODO: change criteria & support batch inputs if necessary pos = torch.tensor(0, dtype=torch.int32) max_cache_len = max_seq_len - 1 - token_list = sp_model.encode( - user_prompts, bos=True, eos=False, allowed_special="all" - ) + + token_list = [] + # Llama2 tokenizer has no special tokens + if isinstance(tokenizer, SentencePieceTokenizer): + token_list = tokenizer.encode(user_prompts, bos=True, eos=False) + elif isinstance(tokenizer, Tiktoken): + token_list = tokenizer.encode( + user_prompts, bos=True, eos=False, allowed_special="all" + ) + else: + raise RuntimeError("Unkown tokenizer") with torch.no_grad(): - while token_list[-1] != sp_model.eos_id and pos < max_cache_len: + while token_list[-1] != tokenizer.eos_id and pos < max_cache_len: logits, new_k_caches, new_v_caches = module( torch.full((1, 1), token_list[pos], dtype=torch.int32), atten_mask, @@ -106,28 +119,36 @@ def _kv_calibrate( if pos >= len(token_list): token_list.append(torch.argmax(logits[:, -1], dim=-1).item()) - print(f"calibration data:\n{sp_model.decode(token_list)}") + print(f"kv calibration data:\n{tokenizer.decode(token_list)}") def _prefill_calibrate( example_inputs, user_prompts, module: torch.fx.GraphModule, - tokenizer_model_path="tokenizer.model", + tokenizer, max_seq_len=512, ): - sp_model = get_tokenizer(tokenizer_model_path) _, atten_mask = example_inputs max_cache_len = max_seq_len - 1 # TODO: change criteria & support batch inputs if necessary - token_list = sp_model.encode( - user_prompts, bos=True, eos=False, allowed_special="all" - ) + + token_list = [] + # Llama2 tokenizer has no special tokens + if isinstance(tokenizer, SentencePieceTokenizer): + token_list = tokenizer.encode(user_prompts, bos=True, eos=False) + elif isinstance(tokenizer, Tiktoken): + token_list = tokenizer.encode( + user_prompts, bos=True, eos=False, allowed_special="all" + ) + else: + raise RuntimeError("Unkown tokenizer") + pos = len(token_list) with torch.no_grad(): - while token_list[-1] != sp_model.eos_id and pos < max_cache_len: + while token_list[-1] != tokenizer.eos_id and pos < max_cache_len: tmp_token_list = torch.tensor(token_list).reshape(1, -1) if pos < max_cache_len: tmp_token_list = torch.cat( @@ -144,14 +165,14 @@ def _prefill_calibrate( token_list.append(torch.argmax(logits[:, pos - 1], dim=-1).item()) pos += 1 - print(f"calibration data:\n{sp_model.decode(token_list)}") + print(f"prefill calibration data:\n{tokenizer.decode(token_list)}") def calibrate( example_inputs, user_prompts, module: torch.fx.GraphModule, - tokenizer_model_path="tokenizer.model", + tokenizer, max_seq_len=512, ): if len(example_inputs) == 2: @@ -159,7 +180,7 @@ def calibrate( example_inputs, user_prompts, module, - tokenizer_model_path, + tokenizer, max_seq_len, ) elif len(example_inputs) == 5: @@ -167,7 +188,7 @@ def calibrate( example_inputs, user_prompts, module, - tokenizer_model_path, + tokenizer, max_seq_len, ) else: @@ -280,7 +301,7 @@ def _tag_ios(self, gm: torch.fx.GraphModule, fixed_point_type): return quant_attrs - def quantize(self, quant_dtype, args, custom_annotations=()): + def quantize(self, quant_dtype, args, tokenizer, custom_annotations=()): self.quant_dtype = quant_dtype quantizer = make_quantizer( quant_dtype=quant_dtype, @@ -303,7 +324,7 @@ def quantize(self, quant_dtype, args, custom_annotations=()): 
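The calibration helpers in the hunks above now receive a tokenizer object instead of a tokenizer path and branch on its type before encoding the prompt. A minimal standalone sketch of that dispatch follows; the two classes are stubs standing in for the real `SentencePieceTokenizer` and `Tiktoken` imports, and the token ids are made up:

```python
# Stub tokenizers; only the encode() signatures matter for this sketch.
class SentencePieceTokenizer:
    def encode(self, text, bos=True, eos=False):
        return [1] + [ord(c) for c in text]            # fake Llama2-style ids

class Tiktoken:
    def encode(self, text, bos=True, eos=False, allowed_special="all"):
        return [128000] + [ord(c) for c in text]       # fake Llama3-style ids

def encode_prompt(tokenizer, prompt):
    # Llama2's SentencePiece tokenizer has no special tokens, so it is called
    # without allowed_special; Tiktoken (Llama3) is called with "all".
    if isinstance(tokenizer, SentencePieceTokenizer):
        return tokenizer.encode(prompt, bos=True, eos=False)
    elif isinstance(tokenizer, Tiktoken):
        return tokenizer.encode(prompt, bos=True, eos=False, allowed_special="all")
    raise RuntimeError("Unknown tokenizer")

print(encode_prompt(Tiktoken(), "what is 1+1"))
```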
self.get_example_inputs(self.llama_meta["get_use_kv_cache"]), args.prompt, fx_graph_module, - tokenizer_model_path=args.tokenizer_model, + tokenizer=tokenizer, max_seq_len=self.llama_meta["get_max_seq_len"], ) @@ -366,7 +387,7 @@ def lowering_modules( if num_sharding > 0: update_spill_fill_size(edge_prog_mgr.exported_program()) exec_prog_mgr = edge_prog_mgr.to_executorch(config=executorch_config) - with open(f"{work_space}/{pte_filename}.pte", "wb") as file: + with open(f"{work_space}/{self.pte_filename}.pte", "wb") as file: exec_prog_mgr.write_to_file(file) def get_example_inputs(self, use_kv_cache=True): @@ -376,7 +397,7 @@ def get_quant_attrs(self): return self.quant_attrs -def compile(args, pte_filename): +def compile(args, pte_filename, tokenizer): os.makedirs(args.artifact, exist_ok=True) start_ts = time.time() @@ -407,13 +428,13 @@ def compile(args, pte_filename): ) elif args.model_mode == "hybrid": llama_instance_list.append( - LlamaModel(prefill_config, output_new_cache_only=False) + LlamaModel(kv_config, output_new_cache_only=True) ) llama_instance_list.append( - LlamaModel(kv_config, output_new_cache_only=True) + LlamaModel(prefill_config, output_new_cache_only=False) ) else: - raise RuntimeError(f"No such model_mode {args.model_mode}.") + raise RuntimeError(f"Unknown model_mode: {args.model_mode}.") if "model" in state_dict: state_dict = state_dict["model"] @@ -467,17 +488,35 @@ def compile(args, pte_filename): if args.ptq: start_quantize_ts = time.time() - for llama_instance in llama_instance_list: - llama_instance.quantize( - quant_dtype=quant_dtype, - args=args, - custom_annotations=( - partial( - annotate_matmul_16a8w, - traverse_input1=llama_instance.llama_meta["get_use_kv_cache"], - ), - ), + custom_annotations = (annotate_matmul_16a8w,) + if args.llama_model == "stories110m": + custom_annotations = custom_annotations + ( + annotate_linear_16a8w_in_affine_layer, ) + if args.ptq != None: + kv_quant_attrs = {} + for i, llama_instance in enumerate(llama_instance_list): + llama_instance.quantize( + quant_dtype=quant_dtype, + args=args, + tokenizer=tokenizer, + custom_annotations=custom_annotations, + ) + # If hybrid mode, we store kv output quant_attrs and apply to prefill output quant_attrs later + if i == 0 and args.model_mode == "hybrid": + output_indices = 0 + for node in llama_instance.llama_model.graph.nodes: + if node.op == "output": + for output in node.args[0]: + kv_quant_attrs[output_indices] = output.args[1:] + output_indices += 1 + break + custom_annotations = custom_annotations + ( + partial( + annotate_prefill_kv_output, + kv_quant_attrs=kv_quant_attrs, + ), + ) end_quantize_ts = time.time() logging.info(f"Time for quantizing: {end_quantize_ts - start_quantize_ts}") @@ -520,7 +559,7 @@ def compile(args, pte_filename): backend_options = generate_htp_compiler_spec( use_fp16=use_fp16, use_multi_contexts=args.num_sharding > 0 ) - graph_names = ["prefill_forward", "kv_forward"] + graph_names = ["kv_forward", "prefill_forward"] compiler_specs = [ generate_qnn_executorch_compiler_spec( soc_model=get_soc_to_chipset_map()[args.model], @@ -633,7 +672,7 @@ def compile(args, pte_filename): call_delegate_inputs_dict=call_delegate_inputs_dict, outputs_dict=outputs_dict, backend_config=executorch_config, - constant_methods=llama_instance_list[1].llama_meta, # kv method meta + constant_methods=llama_instance_list[0].llama_meta, # kv method meta ) with open(f"{args.artifact}/{pte_filename}.pte", "wb") as file: exec_prog.write_to_file(file) @@ -662,7 +701,7 @@ def compile(args, 
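The hybrid-mode branch above quantizes the KV model first, walks its graph's `output` node to collect the quantization parameters attached to each output, and only then quantizes the prefill model with `annotate_prefill_kv_output` bound to those parameters. A rough standalone sketch of the collection step follows; `torch.clamp` is only a stand-in for whatever quantize/dequantize ops the pt2e flow actually places on the KV model's outputs:

```python
import torch
import torch.fx as fx

def collect_output_quant_attrs(gm: fx.GraphModule):
    # Mirrors the loop above: for every value returned by the graph, keep all
    # arguments after the tensor argument of the op that produced it. In the
    # real flow those trailing args are the output's quantization parameters,
    # stored per output index so they can be re-applied to the prefill model.
    attrs = {}
    for node in gm.graph.nodes:
        if node.op == "output":
            for idx, out in enumerate(node.args[0]):
                attrs[idx] = out.args[1:]
            break
    return attrs

class Demo(torch.nn.Module):
    def forward(self, x):
        return (torch.clamp(x, 0.0, 6.0),)

print(collect_output_quant_attrs(fx.symbolic_trace(Demo())))  # {0: (0.0, 6.0)}
```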
pte_filename): input_nodes_dict=input_nodes_dict, output_nodes_dict=output_nodes_dict, backend_config=executorch_config, - constant_methods=llama_instance_list[1].llama_meta, # kv method meta + constant_methods=llama_instance_list[0].llama_meta, # kv method meta ) with open(f"{args.artifact}/{pte_filename}.pte", "wb") as file: prog_mgr.write_to_file(file) @@ -672,7 +711,7 @@ def compile(args, pte_filename): return quant_attrs -def inference(args, quant_attrs, pte_filename, pre_gen_pte=""): +def inference(args, quant_attrs, pte_filename, runtime_tokenizer_path, pre_gen_pte=""): workspace = f"/data/local/tmp/{getpass.getuser()}/executorch/single_llama" if args.model_mode == "prefill": @@ -682,14 +721,14 @@ def inference(args, quant_attrs, pte_filename, pre_gen_pte=""): elif args.model_mode == "hybrid": eval_mode = 2 else: - raise RuntimeError(f"No such model_mode {args.model_mode}.") + raise RuntimeError(f"Unknown model_mode: {args.model_mode}.") seq_len = args.prefill_seq_len if args.model_mode == "prefill" else args.kv_seq_len runner_args = " ".join( [ f"--model_path {pte_filename}.pte", "--output_path outputs/outputs.txt", - f"--tokenizer_path {os.path.basename(args.tokenizer_model)}", + f"--tokenizer_path {os.path.basename(runtime_tokenizer_path)}", f'--prompt "{args.prompt}"', f"--seq_len {seq_len}", f"--eval_mode {eval_mode}", @@ -702,7 +741,7 @@ def inference(args, quant_attrs, pte_filename, pre_gen_pte=""): runner_cmd = " ".join( [ f"cd {workspace} &&", - f"./qnn_llama3_2_runner {runner_args}", + f"./qnn_llama_runner {runner_args}", ] ) @@ -720,10 +759,10 @@ def inference(args, quant_attrs, pte_filename, pre_gen_pte=""): host_id=args.host, soc_model=args.model, shared_buffer=args.shared_buffer, - runner=f"examples/qualcomm/oss_scripts/llama3_2/qnn_llama3_2_runner", + runner=f"examples/qualcomm/oss_scripts/llama/qnn_llama_runner", ) # No pregen inputs, input_list is not required - adb.push(inputs=[], input_list="", files=[args.tokenizer_model]) + adb.push(inputs=[], input_list="", files=[runtime_tokenizer_path]) adb.execute(custom_runner_cmd=runner_cmd) # collect output data @@ -756,8 +795,8 @@ def main(): parser.add_argument( "-a", "--artifact", - help="path for storing generated artifacts and output by this example. Default ./llama3_2_qnn", - default="./llama3_2_qnn", + help="path for storing generated artifacts and output by this example. Default ./llama_qnn", + default="./llama_qnn", type=str, ) @@ -768,6 +807,13 @@ def main(): type=str, ) + parser.add_argument( + "--llama_model", + choices=["stories110m", "llama3_2"], + help="The Llama model to export. Current available options are: [stories110m, llama3_2]", + required=True, + ) + parser.add_argument( "--checkpoint", help="Pass llama checkpoint.", @@ -783,10 +829,9 @@ def main(): ) parser.add_argument( - "--model_size", - help="Determine what runner be used. For llama 3.2, we only support 1B/3B. ", - choices=["1B", "3B"], - required=True, + "--tokenizer_bin", + help="For Llama2. Pass Llama2 tokenizer binary.", + required=False, type=str, ) @@ -806,7 +851,7 @@ def main(): parser.add_argument( "--system_prompt", - help="Tells the model what kind of assistant it should be. For example, You are a helpful AI assistant for travel tips and recommendations. Default is None", + help="For Llama3. Tells the model what kind of assistant it should be. For example, You are a helpful AI assistant for travel tips and recommendations. 
Default is None", default="", type=str, ) @@ -829,7 +874,7 @@ def main(): parser.add_argument( "--pre_gen_pte", - help="Run the Pre-generated llama in the given directory", + help="Run the pre-generated llama in the given directory.", type=str, ) @@ -867,26 +912,46 @@ def main(): exit("Cannot set both compile_only and pre_gen_pte as true") if args.model_mode == "kv": - pte_filename = "kv_llama3_2_qnn" + pte_filename = "kv_llama_qnn" elif args.model_mode == "prefill": - pte_filename = "prefill_llama3_2_qnn" + pte_filename = "prefill_llama_qnn" elif args.model_mode == "hybrid": assert ( args.kv_seq_len >= args.prefill_seq_len ), "Please ensure kv_seq_len is >= prefill_seq_len" - pte_filename = "hybrid_llama3_2_qnn" + pte_filename = "hybrid_llama_qnn" else: - raise RuntimeError(f"No such model_mode {args.model_mode}.") + raise RuntimeError(f"Unknown model_mode: {args.model_mode}.") + + tokenizer = get_tokenizer(args.tokenizer_model) + runtime_tokenizer_path = "" + if args.llama_model == "stories110m": + assert isinstance( + tokenizer, SentencePieceTokenizer + ), f"Wrong tokenizer provided for stories110m." + assert ( + args.tokenizer_bin is not None + ), "Please provide tokenizer_bin for stories110m." + runtime_tokenizer_path = args.tokenizer_bin + elif args.llama_model == "llama3_2": + assert isinstance( + tokenizer, Tiktoken + ), f"Wrong tokenizer provided for llama3_2." + runtime_tokenizer_path = args.tokenizer_model + else: + raise RuntimeError(f"Unknown llama_model: {args.llama_model}.") if args.pre_gen_pte: quant_attrs = json.load( open(f"{args.pre_gen_pte}/{pte_filename}_quant_attrs.txt") ) - inference(args, quant_attrs, pte_filename, args.pre_gen_pte) + inference( + args, quant_attrs, pte_filename, runtime_tokenizer_path, args.pre_gen_pte + ) exit(f"Finish the running pre_gen_pte from {args.pre_gen_pte}") if args.compile_only: - quant_attrs = compile(args, pte_filename) + quant_attrs = compile(args, pte_filename, tokenizer) if quant_attrs: json.dump( { @@ -900,7 +965,7 @@ def main(): exit(f"Finish compile_only and save to {args.artifact}") try: - quant_attrs = compile(args, pte_filename) + quant_attrs = compile(args, pte_filename, tokenizer) if quant_attrs: logging.info( f"Logit scale: {quant_attrs['scale']}; Logit offset: {quant_attrs['zero_point']}" @@ -914,7 +979,7 @@ def main(): ) else: logging.warning("Quant attributes of the logit is None.") - inference(args, quant_attrs, pte_filename) + inference(args, quant_attrs, pte_filename, runtime_tokenizer_path) except Exception as e: if args.ip and args.port != -1: with Client((args.ip, args.port)) as conn: diff --git a/examples/qualcomm/oss_scripts/llama2/model/static_llama.py b/examples/qualcomm/oss_scripts/llama/model/static_llama.py similarity index 100% rename from examples/qualcomm/oss_scripts/llama2/model/static_llama.py rename to examples/qualcomm/oss_scripts/llama/model/static_llama.py diff --git a/examples/qualcomm/oss_scripts/llama3_2/qnn_llama3_2_runner.cpp b/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp similarity index 85% rename from examples/qualcomm/oss_scripts/llama3_2/qnn_llama3_2_runner.cpp rename to examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp index 2af882580e..7660952ef0 100644 --- a/examples/qualcomm/oss_scripts/llama3_2/qnn_llama3_2_runner.cpp +++ b/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp @@ -9,12 +9,13 @@ /** * @file * - * This tool can run Llama3.2 1B/3B with Qualcomm AI Engine Direct. 
+ * This tool can run Llama2 110M, Llama3.2 1B / 3B(WIP) with Qualcomm AI Engine + * Direct. * */ #include -#include +#include #include #include #include @@ -22,7 +23,7 @@ DEFINE_string( model_path, - "qnn_llama2.pte", + "kv_llama_qnn.pte", "Model serialized in flatbuffer format."); DEFINE_string( @@ -42,11 +43,11 @@ DEFINE_double( DEFINE_int32( seq_len, 128, - "Total number of tokens to generate (prompt + output). Defaults to max_seq_len. If the number of input tokens + seq_len > max_seq_len, the output will be truncated to max_seq_len tokens."); + "Total number of tokens to generate (prompt + output)."); DEFINE_int32( eval_mode, - 0, + 1, "0: PromptProcessor(prefill) / 1: TokenGenerator(kv) / 2: HybridMode (prefill+kv)"); DEFINE_double(logits_scale, 0.0, "Logits scale"); DEFINE_int32(logits_offset, 0, "Logits offset"); diff --git a/examples/qualcomm/oss_scripts/llama3_2/runner/io_memory.cpp b/examples/qualcomm/oss_scripts/llama/runner/io_memory.cpp similarity index 99% rename from examples/qualcomm/oss_scripts/llama3_2/runner/io_memory.cpp rename to examples/qualcomm/oss_scripts/llama/runner/io_memory.cpp index 941ff97685..22efd5a334 100644 --- a/examples/qualcomm/oss_scripts/llama3_2/runner/io_memory.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/io_memory.cpp @@ -6,7 +6,7 @@ * LICENSE file in the root directory of this source tree. */ -#include +#include #include #include @@ -403,7 +403,7 @@ void HybridMemory::update_prefill_to_kv_io( // If prompt len is 30, prefill will handle to pos = 30. // At this point, pos should be 31. for (int i = 0; i < pos + 1; i++) { - ptr->kv_attention_mask[kv_cache_len_ - i] = 0; + ptr->kv_attention_mask[kv_cache_len_ - i] = 65535; } // update v_cache diff --git a/examples/qualcomm/oss_scripts/llama3_2/runner/io_memory.h b/examples/qualcomm/oss_scripts/llama/runner/io_memory.h similarity index 100% rename from examples/qualcomm/oss_scripts/llama3_2/runner/io_memory.h rename to examples/qualcomm/oss_scripts/llama/runner/io_memory.h diff --git a/examples/qualcomm/oss_scripts/llama3_2/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp similarity index 94% rename from examples/qualcomm/oss_scripts/llama3_2/runner/runner.cpp rename to examples/qualcomm/oss_scripts/llama/runner/runner.cpp index 02a53861b8..e06d52fbb3 100644 --- a/examples/qualcomm/oss_scripts/llama3_2/runner/runner.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp @@ -10,7 +10,7 @@ // logic. The module takes in a string as input and emits a string as output. #include -#include +#include #include #include #include @@ -57,7 +57,7 @@ Runner::Runner( ET_LOG(Info, "creating module: model_path=%s", models_path[i].c_str()); } ET_LOG(Info, "creating runner: tokenizer_path=%s", tokenizer_path_.c_str()); - ET_LOG(Info, "eval mode=%d", eval_mode); + ET_LOG(Info, "eval mode=%d", eval_mode_); } bool Runner::is_loaded() const { @@ -168,12 +168,14 @@ Error Runner::load() { // llama2 tokenizer tokenizer_ = std::make_unique(); err = tokenizer_->load(tokenizer_path_); + llama_version_ = LlamaVersion::kLlama2; ET_CHECK_MSG( err == Error::Ok, "failed to load tokenizer %s", tokenizer_path_.c_str()); } else { eos_id_.insert(tokenizer_->encode("<|eot_id|>", 0, 0).get()[0]); + llama_version_ = LlamaVersion::kLlama3; } bos_id_ = tokenizer_->bos_tok(); eos_id_.insert(tokenizer_->eos_tok()); @@ -217,8 +219,7 @@ int32_t Runner::logitsToToken(const Tensor& logits_tensor, int64_t pos) { // offset to the meaningful logit we want. 
if (logits_tensor.sizes().data()[1] > 1) { - auto vocab_size = logits_tensor.size(2); - logits_last += pos * vocab_size; + logits_last += pos * vocab_size_; } // dequantize @@ -277,17 +278,27 @@ Error Runner::generate( ET_CHECK_MSG(!prompt.empty(), "prompt cannot be null"); - if (!system_prompt.empty()) { - prompt_.append("<|start_header_id|>system<|end_header_id|>\n\n"); - prompt_.append(system_prompt); - prompt_.append("<|eot_id|>"); - } - prompt_.append("<|start_header_id|>user<|end_header_id|>\n\n"); - prompt_.append(prompt); - prompt_.append("<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"); - - if (token_callback) { - token_callback("<|begin_of_text|>"); + switch (llama_version_) { + case LlamaVersion::kLlama2: + prompt_.append(prompt); + break; + case LlamaVersion::kLlama3: + if (!system_prompt.empty()) { + prompt_.append("<|start_header_id|>system<|end_header_id|>\n\n"); + prompt_.append(system_prompt); + prompt_.append("<|eot_id|>"); + } + prompt_.append("<|start_header_id|>user<|end_header_id|>\n\n"); + prompt_.append(prompt); + prompt_.append( + "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"); + if (token_callback) { + token_callback("<|begin_of_text|>"); + } + break; + default: + ET_CHECK_MSG(false, "unsupported llama version"); + break; } int max_seq_len = std::max(prefill_cache_len_, kv_cache_len_) + 1; @@ -318,14 +329,14 @@ Error Runner::generate( int64_t pos = 0, prev_token, cur_token = prompt_tokens[0]; HybridMemory::IO* ptr = static_cast(io_mem_->get_mutable_ptr()); + if (token_callback) { + token_callback(prompt_); + } auto prefill_execute = [&](const std::string& method_name) { for (int i = 0; i < num_prompt_tokens; i++) { ptr->prefill_input_toks[i] = static_cast(prompt_tokens[i]); } - if (token_callback) { - token_callback(prompt_); - } pos = num_prompt_tokens - 1; cur_token = prompt_tokens[pos]; @@ -389,7 +400,7 @@ Error Runner::generate( auto piece_res = tokenizer_->decode(prev_token, cur_token); ET_CHECK(piece_res.ok()); - if (token_callback) { + if (token_callback && pos >= num_prompt_tokens) { token_callback(piece_res.get().c_str()); } diff --git a/examples/qualcomm/oss_scripts/llama3_2/runner/runner.h b/examples/qualcomm/oss_scripts/llama/runner/runner.h similarity index 95% rename from examples/qualcomm/oss_scripts/llama3_2/runner/runner.h rename to examples/qualcomm/oss_scripts/llama/runner/runner.h index 75ad640219..aaf79360bd 100644 --- a/examples/qualcomm/oss_scripts/llama3_2/runner/runner.h +++ b/examples/qualcomm/oss_scripts/llama/runner/runner.h @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include #include @@ -73,6 +73,10 @@ class Runner { get_methods_meta(std::string& method_name); private: + enum LlamaVersion { + kLlama2 = 0, + kLlama3, + }; template T getMetadataHelper(std::string method_name, T default_val); int32_t logitsToToken( @@ -104,6 +108,7 @@ class Runner { std::string prefill_forward_name_; std::string kv_forward_name_; std::vector method_names_; + LlamaVersion llama_version_; }; } // namespace example diff --git a/examples/qualcomm/oss_scripts/llama3_2/targets.bzl b/examples/qualcomm/oss_scripts/llama/targets.bzl similarity index 95% rename from examples/qualcomm/oss_scripts/llama3_2/targets.bzl rename to examples/qualcomm/oss_scripts/llama/targets.bzl index 64adc7eca9..9780da0369 100644 --- a/examples/qualcomm/oss_scripts/llama3_2/targets.bzl +++ b/examples/qualcomm/oss_scripts/llama/targets.bzl @@ -36,9 +36,9 @@ def define_common_targets(): ) runtime.cxx_binary( - name = 
"qnn_llama3_2_runner", + name = "qnn_llama_runner", srcs = [ - "qnn_llama3_2_runner.cpp", + "qnn_llama_runner.cpp", ], compiler_flags = [ "-Wno-global-constructors", diff --git a/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt b/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt deleted file mode 100644 index 61a2ecda56..0000000000 --- a/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) Qualcomm Innovation Center, Inc. -# All rights reserved -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -set(_qnn_llama_runner__srcs ${_llama_runner__srcs}) - -# preprocess qnn llama runner src files -list(TRANSFORM _qnn_llama_runner__srcs PREPEND "${EXECUTORCH_SOURCE_DIR}/") -list(FILTER _qnn_llama_runner__srcs EXCLUDE REGEX ".*(/runner/).*") -list( - PREPEND - _qnn_llama_runner__srcs - ${CMAKE_CURRENT_LIST_DIR}/qnn_llama_runner.cpp - ${CMAKE_CURRENT_LIST_DIR}/runner/runner.cpp - ${CMAKE_CURRENT_LIST_DIR}/runner/runner.h -) - -# build qnn llama runner -add_executable(qnn_llama_runner ${_qnn_llama_runner__srcs}) -target_include_directories( - qnn_llama_runner PUBLIC ${_common_include_directories} -) -target_link_libraries( - qnn_llama_runner - qnn_executorch_backend - full_portable_ops_lib - extension_data_loader - extension_module - extension_tensor - gflags - re2::re2 -) -target_compile_options(qnn_llama_runner PUBLIC ${_common_compile_options}) -set_target_properties( - qnn_llama_runner PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'" -) diff --git a/examples/qualcomm/oss_scripts/llama2/README.md b/examples/qualcomm/oss_scripts/llama2/README.md deleted file mode 100644 index d83902a6de..0000000000 --- a/examples/qualcomm/oss_scripts/llama2/README.md +++ /dev/null @@ -1,39 +0,0 @@ -# Summary - -## Overview -This file provides you the instructions to run LLAMA2 with different parameters via Qualcomm HTP backend. Following settings support for Stories 110M - -Please check corresponding section for more information. - -## Stories 110M -This example demonstrates how to run a smaller LLAMA2, stories110M on mobile via Qualcomm HTP backend. Model architecture is fine-tuned specifically for HTP to accelerate the performance. Weight is quantized via PTQ quantization to fit the model on a phone. - -### Instructions -#### Step 1: Setup -1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. -2. Follow the [tutorial](https://pytorch.org/executorch/stable/build-run-qualcomm-ai-engine-direct-backend.html) to build Qualcomm AI Engine Direct Backend. - -#### Step2: Prepare Model -Download and preapre stories110M model - -```bash -# tokenizer.model & stories110M.pt: -wget "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt" -wget "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model" - -# tokenizer.bin: -python -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin - -# params.json: -echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json -``` - -#### Step3: Run default examples -Default example generates the story based on the given prompt, "Once". 
-```bash -# 16a4w quant: -python examples/qualcomm/oss_scripts/llama2/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint stories110M --params params.json --tokenizer_model tokenizer.model --tokenizer_bin tokenizer.bin --prompt "Once" -``` - -#### (Note) Customized PTQ data set -User prompts are used for PTQ calibration data. Take the examples above, the word "Once" is the only word for PTQ. If you want to observe more data during the calibration time. Please add more prompts to the args `--prompt`. \ No newline at end of file diff --git a/examples/qualcomm/oss_scripts/llama2/TARGETS b/examples/qualcomm/oss_scripts/llama2/TARGETS deleted file mode 100644 index b0f5ea7f64..0000000000 --- a/examples/qualcomm/oss_scripts/llama2/TARGETS +++ /dev/null @@ -1,43 +0,0 @@ -load("@fbcode_macros//build_defs:python_library.bzl", "python_library") -load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_verision") -load("@fbcode_macros//build_defs:python_binary.bzl", "python_binary") -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") - -oncall("executorch") - - -python_library( - name = "static_llama", - srcs = [ - "model/static_llama.py", - ], - deps = [ - "//caffe2:torch", - ], -) - -python_binary( - name = "llama", - srcs = ["llama.py"], - main_function = "executorch.examples.qualcomm.oss_scripts.llama2.llama.main", - deps = [ - ":static_llama", - "//caffe2:torch", - "//executorch/extension/pybindings:aten_lib", - "//executorch/backends/qualcomm/partition:partition", - "//executorch/backends/qualcomm/quantizer:quantizer", - "//executorch/devtools:lib", - "//executorch/examples/models:models", - "//executorch/examples/qualcomm:utils", - "//executorch/extension/export_util:export_util", - "//executorch/extension/llm/export:export_lib", - ], -) - -runtime.command_alias( - name = "llama_qnn", - env = { - "LD_LIBRARY_PATH": "$(location fbsource//third-party/qualcomm/qnn/qnn-{0}:qnn_offline_compile_libs)".format(get_qnn_library_verision()), - }, - exe = ":llama", -) diff --git a/examples/qualcomm/oss_scripts/llama2/llama.py b/examples/qualcomm/oss_scripts/llama2/llama.py deleted file mode 100755 index 2a2968362a..0000000000 --- a/examples/qualcomm/oss_scripts/llama2/llama.py +++ /dev/null @@ -1,690 +0,0 @@ -# Copyright (c) Qualcomm Innovation Center, Inc. -# All rights reserved -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# TODO: reenable pyre after fixing the issues -# pyre-ignore-all-errors - -import codecs -import getpass -import json -import os -import time -from multiprocessing.connection import Client - -import torch -from executorch.backends.qualcomm._passes.build_quant_io import BuildQuantIo - -from executorch.backends.qualcomm.partition.qnn_partitioner import QnnPartitioner - -from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype -from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset -from executorch.backends.qualcomm.utils.constants import QCOM_QUANTIZED_IO -from executorch.backends.qualcomm.utils.utils import ( - capture_program, - convert_linear_to_conv2d, - generate_htp_compiler_spec, - generate_qnn_executorch_compiler_spec, - get_soc_to_chipset_map, -) -from executorch.examples.qualcomm.oss_scripts.llama2.model.static_llama import ( - LlamaModel, - ModelArgs, -) -from executorch.examples.qualcomm.utils import ( - make_output_dir, - make_quantizer, - setup_common_args_and_variables, - SimpleADB, -) -from executorch.exir import EdgeCompileConfig, EdgeProgramManager -from executorch.exir.capture._config import ExecutorchBackendConfig -from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass -from executorch.extension.llm.export.builder import DType - -from sentencepiece import SentencePieceProcessor -from torch.ao.quantization.observer import MinMaxObserver -from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e - - -pte_filename = "llama2_qnn" - - -def annotate_matmul_16a8w(gm: torch.fx.GraphModule) -> None: - """ - This function is specific for matmul op 16a8w. - """ - - from executorch.backends.qualcomm.quantizer.annotators import QUANT_ANNOTATION_KEY - from executorch.backends.qualcomm.quantizer.quantizer import ( - get_16a8w_qnn_ptq_config, - get_8a8w_qnn_ptq_config, - QuantizationConfig, - ) - from torch.ao.quantization.quantizer import ( - QuantizationAnnotation, - SharedQuantizationSpec, - ) - from torch.fx import Node - - def annotate_matmul(node: Node, quantization_config: QuantizationConfig): - input_qspec_map = {} - input_act = node.args[0] - input_spec = quantization_config.input_activation - input_qspec_map[input_act] = input_spec - - input_act1 = node.args[1] - input_spec1 = quantization_config.weight - input_qspec_map[input_act1] = input_spec1 - - node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation( - input_qspec_map=input_qspec_map, - output_qspec=quantization_config.output_activation, - _annotated=True, - ) - - def annotate_cat(node: Node, quantization_config: QuantizationConfig): - input_nodes = node.args[0] - - first_input_node = input_nodes[0] - input_qspec_map = {} - input_qspec_map[first_input_node] = quantization_config.input_activation - share_qparams_with_input_act0_qspec = SharedQuantizationSpec( - (first_input_node, node) - ) - - for input_node in input_nodes[1:]: - if input_node not in input_qspec_map: - input_qspec_map[input_node] = share_qparams_with_input_act0_qspec - - node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation( - input_qspec_map=input_qspec_map, - output_qspec=share_qparams_with_input_act0_qspec, - _annotated=True, - ) - - def annotate_single_in_single_out( - node: Node, quantization_config: QuantizationConfig - ) -> None: - input_qspec_map = {} - input_act = node.args[0] - input_qspec_map[input_act] = quantization_config.input_activation - - node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation( - input_qspec_map=input_qspec_map, - 
output_qspec=quantization_config.output_activation, - _annotated=True, - ) - - def annotate_matmul_input1(node: Node): - quantization_config_8a8w = get_8a8w_qnn_ptq_config(act_symmetric=True) - while isinstance(node, Node) and node.op == "call_function": - if node.target in [ - torch.ops.aten.permute.default, - torch.ops.aten.transpose.int, - ]: - annotate_single_in_single_out(node, quantization_config_8a8w) - node = node.args[0] - elif node.target == torch.ops.aten.cat.default: - annotate_cat(node, quantization_config_8a8w) - node = node.args[0][0] - else: - node = node.args[0] - - quantization_config_16a8w = get_16a8w_qnn_ptq_config() - - for node in gm.graph.nodes: - if node.op == "call_function" and node.target == torch.ops.aten.matmul.default: - annotate_matmul(node, quantization_config_16a8w) - annotate_matmul_input1(node.args[1]) - - -def annotate_linear_16a8w_in_affine_layer(gm: torch.fx.GraphModule) -> None: - from executorch.backends.qualcomm.quantizer.annotators import QUANT_ANNOTATION_KEY - from executorch.backends.qualcomm.quantizer.quantizer import ( - get_ptq_per_channel_quant_config, - QuantizationConfig, - ) - from torch.ao.quantization.quantizer import QuantizationAnnotation - from torch.fx import Node - - def annotate_conv2d(node: Node, quantization_config: QuantizationConfig) -> None: - input_qspec_map = {} - input_act = node.args[0] - input_spec = quantization_config.input_activation - input_qspec_map[input_act] = input_spec - - weight = node.args[1] - input_qspec_map[weight] = quantization_config.weight - - node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation( - input_qspec_map=input_qspec_map, - output_qspec=quantization_config.output_activation, - _annotated=True, - ) - - quantization_config_16a8w_per_channel = get_ptq_per_channel_quant_config( - torch.uint16, weight_dtype=torch.int8 - ) - for node in gm.graph.nodes: - if node.op == "call_function" and node.target == torch.ops.aten.conv2d.default: - if "nn_module_stack" in node.meta: - module_values_list = list(node.meta["nn_module_stack"].values()) - full_qualified_name = module_values_list[0][0] - if full_qualified_name == "L['self'].llama.output": - annotate_conv2d( - node, quantization_config=quantization_config_16a8w_per_channel - ) - - -def _kv_calibrate( - example_inputs, - user_prompts, - module: torch.fx.GraphModule, - tokenizer_model_path="tokenizer.model", - max_seq_len=512, -): - sp_model = SentencePieceProcessor(model_file=tokenizer_model_path) - _, atten_mask, _, k_caches, v_caches = example_inputs - - # TODO: change criteria & support batch inputs if necessary - pos = torch.tensor(0, dtype=torch.int32) - token_list = [sp_model.bos_id()] - for prompt in user_prompts.split(): - token_list += sp_model.encode(prompt) - - def sample_top_p(probs: torch.Tensor, top_p: float) -> torch.Tensor: - probs_sort, probs_indices = torch.sort(probs, dim=-1, descending=True) - probs_sum = torch.cumsum(probs_sort, dim=-1) - mask = probs_sum - probs_sort > top_p - probs_sort[mask] = 0 - probs_sort /= probs_sort.sum(dim=-1, keepdim=True) - next_token = torch.multinomial(probs_sort, num_samples=1) - return probs_indices.gather(dim=-1, index=next_token) - - with torch.no_grad(): - while token_list[-1] != sp_model.eos_id() and pos < max_seq_len - 1: - logits, new_k_caches, new_v_caches = module( - torch.full((1, 1), token_list[pos]), - atten_mask, - torch.full((1, 1), pos), - *k_caches, - *v_caches, - ) - k_caches = [ - torch.cat([k_cache[:, :, 1:], new_k_caches[i]], dim=-1) - for i, k_cache in enumerate(k_caches) - ] - 
v_caches = [ - torch.cat([v_cache[:, 1:, :], new_v_caches[i]], dim=1) - for i, v_cache in enumerate(v_caches) - ] - - pos += 1 - atten_mask[0][-pos - 1] = 0 - if pos >= len(token_list): - probs = torch.softmax(logits[:, -1] / 0.8, dim=-1) - token_list.append(sample_top_p(probs, 0.9).item()) - - print(f"calibration data:\n{sp_model.decode(token_list)}") - - -def _batch_prefill_calibrate( - example_inputs, - user_prompts, - module: torch.fx.GraphModule, - tokenizer_model_path="tokenizer.model", - max_seq_len=512, -): - sp_model = SentencePieceProcessor(model_file=tokenizer_model_path) - _, atten_mask = example_inputs - max_cache_len = max_seq_len - 1 - - # TODO: change criteria & support batch inputs if necessary - token_list = sp_model.encode(user_prompts, bos=True, eos=False) - token_list = torch.tensor(token_list)[:max_cache_len].reshape(1, -1) - last_prompt_pos = token_list.numel() - if last_prompt_pos < max_cache_len: - token_list = torch.cat( - [ - token_list, - torch.zeros((1, max_cache_len - last_prompt_pos), dtype=torch.int32), - ], - dim=1, - ) - else: - token_list = token_list[:, :max_cache_len] - - with torch.no_grad(): - logits, new_k_caches, new_v_caches = module( - token_list, - atten_mask, - ) - predict = [torch.argmax(logits[:, last_prompt_pos - 1], dim=-1).item()] - - print(f"calibration data:\n{sp_model.decode(predict)}") - - -def calibrate( - example_inputs, - user_prompts, - module: torch.fx.GraphModule, - tokenizer_model_path="tokenizer.model", - max_seq_len=512, -): - if len(example_inputs) == 2: - _batch_prefill_calibrate( - example_inputs, - user_prompts, - module, - tokenizer_model_path, - max_seq_len, - ) - elif len(example_inputs) == 5: - _kv_calibrate( - example_inputs, - user_prompts, - module, - tokenizer_model_path, - max_seq_len, - ) - else: - raise RuntimeError("Get wrong inputs") - - -class SingleLlama: - def __init__(self, llama_model) -> None: - super().__init__() - self.llama_model = llama_model - self.quant_dtype = None - self.llama_meta = self.llama_model.get_metadata() - self.has_quant_io = False - if self.llama_meta["get_use_kv_cache"]: - tokens, atten_mask, pos_ids, k_caches, v_caches = self.get_example_inputs( - use_kv_cache=True - ) - self.inputs = (tokens, atten_mask, pos_ids, *k_caches, *v_caches) - else: - tokens, atten_mask = self.get_example_inputs(use_kv_cache=False) - self.inputs = (tokens, atten_mask) - - def _tag_kv_ios(self, gm: torch.fx.GraphModule, kv_type): - if not self.has_quant_io: - return - - # shape of k caches and v caches - input_cache_shape = { - (self.llama_meta["get_head_dim"], self.llama_meta["get_max_seq_len"]), - (self.llama_meta["get_max_seq_len"], self.llama_meta["get_head_dim"]), - } - for n in gm.graph.nodes: - if ( - n.op == "placeholder" - and len(users := list(n.users)) == 1 - and users[0].meta["val"].size()[-2:] in input_cache_shape - ): - n.meta[QCOM_QUANTIZED_IO] = kv_type - elif n.op == "output": - for a in n.args[0]: - # single head, kv mode - if ( - a.meta["val"].flatten().size()[0] - == self.llama_meta["get_head_dim"] - ): - a.meta[QCOM_QUANTIZED_IO] = kv_type - # single head, batch_prefill mode - elif a.meta["val"].flatten().size()[0] == self.llama_meta[ - "get_head_dim" - ] * (self.llama_meta["get_max_seq_len"] - 1): - a.meta[QCOM_QUANTIZED_IO] = kv_type - - def quantize(self, quant_dtype, args, custom_annotations=()): - self.quant_dtype = quant_dtype - quantizer = make_quantizer( - quant_dtype=quant_dtype, - per_channel_conv=True, - per_channel_linear=True, - act_observer=MinMaxObserver, - ) - 
quantizer.add_custom_quant_annotations(custom_annotations) - - self.has_quant_io = True - fx_graph_module = None - - with torch.no_grad(): - fx_graph_module = torch.export.export( - self.llama_model, self.inputs, strict=True - ).module() - fx_graph_module = prepare_pt2e(fx_graph_module, quantizer) - print("Quantizing the model...") - - calibrate( - self.get_example_inputs(self.llama_meta["get_use_kv_cache"]), - args.prompt, - fx_graph_module, - tokenizer_model_path=args.tokenizer_model, - max_seq_len=args.seq_len, - ) - - self.llama_model = convert_pt2e(fx_graph_module) - - def lowering_modules( - self, work_space, kv_type=torch.uint8, soc_model=QcomChipset.SM8650 - ): - executorch_config = ExecutorchBackendConfig( - passes=[ - BuildQuantIo(), - ], - # For shared buffer, user must pass the memory address - # which is allocated by RPC memory to executor runner. - # Therefore, won't want to pre-allocate - # by memory manager in runtime. - memory_planning_pass=MemoryPlanningPass( - alloc_graph_input=False, - alloc_graph_output=False, - ), - extract_delegate_segments=True, - ) - with torch.no_grad(): - # backend option - backend_options = generate_htp_compiler_spec(use_fp16=False) - compiler_specs = generate_qnn_executorch_compiler_spec( - soc_model=soc_model, - backend_options=backend_options, - shared_buffer=True, - ) - partitioner = QnnPartitioner(compiler_specs) - edge_prog = capture_program(self.llama_model, self.inputs) - self._tag_kv_ios(edge_prog.exported_program.graph_module, kv_type=kv_type) - edge_prog_mgr = EdgeProgramManager( - edge_programs={"forward": edge_prog.exported_program}, - constant_methods=self.llama_meta, - compile_config=EdgeCompileConfig(_check_ir_validity=False), - ) - edge_prog_mgr = edge_prog_mgr.to_backend(partitioner) - exec_prog_mgr = edge_prog_mgr.to_executorch(config=executorch_config) - with open(f"{work_space}/{pte_filename}.pte", "wb") as file: - exec_prog_mgr.write_to_file(file) - - def get_example_inputs(self, use_kv_cache=True): - return self.llama_model.get_example_inputs(use_kv_cache) - - -def compile(args): - os.makedirs(args.artifact, exist_ok=True) - start_ts = time.time() - - if args.model_mode == "kv": - use_kv_cache = output_new_cache_only = True - elif args.model_mode == "batch_prefill" or args.model_mode == "hybrid": - raise NotImplementedError( - f"model_mode {args.model_mode} is not implemented yet." - ) - else: - raise RuntimeError(f"No such model_mode {args.model_mode}.") - - with open(args.params) as f: - config = ModelArgs(**json.load(f)) - # TODO: support batch inputs if necessary - config.max_batch_size = 1 - config.max_seq_len = args.seq_len - config.use_kv_cache = use_kv_cache - state_dict = torch.load( - args.checkpoint, weights_only=True, map_location="cpu", mmap=True - ) - end_load_ts = time.time() - print("torch.load checkpoint", end_load_ts - start_ts) - - llama_instance = None - with torch.device("meta"): - llama_instance = LlamaModel(config, output_new_cache_only=output_new_cache_only) - if "model" in state_dict: - state_dict = state_dict["model"] - llama_instance.load_state_dict( - state_dict, - strict=False, - assign=True, - ) - end_load_state_dict_ts = time.time() - print("instance.load_state_dict", end_load_state_dict_ts - end_load_ts) - - for layer in llama_instance.layers: - if getattr(layer.attention, "prepare_sha", None): - layer.attention.prepare_sha() - - kv_type = torch.uint8 - assert args.ptq in [ - "8a8w", - "16a4w", - ], f"No support for quant type {args.ptq}. Support 8a8w and 16a4w." 
- quant_dtype = getattr(QuantDtype, f"use_{args.ptq}") - assert args.tokenizer_model is not None, "Need tokenizer model for calibration" - - if args.dtype_override is not None: - dtype_override = DType[args.dtype_override] - llama_instance = llama_instance.to(dtype_override.to_torch_dtype()) - - llama_instance = convert_linear_to_conv2d(llama_instance) - single_llama = SingleLlama(llama_instance.eval()) - - start_quantize_ts = time.time() - single_llama.quantize( - quant_dtype, - args=args, - custom_annotations=( - annotate_matmul_16a8w, - annotate_linear_16a8w_in_affine_layer, - ), - ) - end_quantize_ts = time.time() - print("single_llama.quantize(quant_dtype)", end_quantize_ts - start_quantize_ts) - single_llama.lowering_modules( - args.artifact, kv_type=kv_type, soc_model=get_soc_to_chipset_map()[args.model] - ) - end_lowering_ts = time.time() - print("Complete Compile", end_lowering_ts - end_quantize_ts) - - -def inference(args, pre_gen_pte=""): - workspace = f"/data/local/tmp/{getpass.getuser()}/executorch/single_llama" - - if args.model_mode != "kv": - raise NotImplementedError( - f"model_mode {args.model_mode} is not implemented yet." - ) - - assert args.tokenizer_bin is not None, "Need tokenizer model for interence" - runner_args = " ".join( - [ - f"--model_path {pte_filename}.pte", - "--output_folder_path outputs", - f"--tokenizer_path {os.path.basename(args.tokenizer_bin)}", - f'--prompt "{args.prompt}"', - f"--seq_len {args.seq_len}", - f"--temperature {args.temperature}", - ] - ) - runner_cmd = " ".join( - [ - f"cd {workspace} &&", - f"./qnn_llama_runner {runner_args}", - ] - ) - - pte_path = ( - f"{pre_gen_pte}/{pte_filename}.pte" - if pre_gen_pte - else f"{args.artifact}/{pte_filename}.pte" - ) - adb = SimpleADB( - qnn_sdk=os.getenv("QNN_SDK_ROOT"), - build_path=f"{args.build_folder}", - pte_path=pte_path, - workspace=workspace, - device_id=args.device, - host_id=args.host, - soc_model=args.model, - shared_buffer=args.shared_buffer, - runner="examples/qualcomm/oss_scripts/llama2/qnn_llama_runner", - ) - # No pregen inputs, input_list is not required - adb.push(inputs=[], input_list="", files=[args.tokenizer_bin]) - adb.execute(custom_runner_cmd=runner_cmd) - - # collect output data - output_data_folder = f"{args.artifact}/outputs" - make_output_dir(output_data_folder) - outputs = [] - - def post_process(): - for f in sorted( - os.listdir(output_data_folder), key=lambda f: int(f.split("_")[1]) - ): - with codecs.open( - os.path.join(output_data_folder, f), - "r", - encoding="utf-8", - errors="replace", - ) as fdata: - outputs.append(fdata.read()) - - adb.pull(output_path=args.artifact, callback=post_process) - - if args.ip and args.port != -1: - with Client((args.ip, args.port)) as conn: - conn.send( - json.dumps( - { - "result": outputs, - } - ) - ) - else: - for idx, output in enumerate(outputs): - print(f"Results[{idx}]:\n{output}") - - -def main(): - parser = setup_common_args_and_variables() - parser.add_argument( - "-a", - "--artifact", - help="path for storing generated artifacts and output by this example. Default ./llama2_qnn", - default="./llama2_qnn", - type=str, - ) - - parser.add_argument( - "-P", - "--ptq", - help="If specified, will do PTQ quantization. default is 16bits activation and 4bits weight. 
Support 8a8w and 16a4w.", - default="16a4w", - ) - - parser.add_argument( - "--checkpoint", - help="Pass llama2 checkpoint.", - required=True, - type=str, - ) - - parser.add_argument( - "--params", - help="Pass llama2 params json file.", - required=True, - type=str, - ) - - parser.add_argument( - "--tokenizer_bin", - help="Pass llama2 tokenizer binary.", - required=False, - type=str, - ) - - parser.add_argument( - "--tokenizer_model", - help="Pass llama2 tokenizer model.", - type=str, - default=None, - ) - - parser.add_argument( - "--prompt", - help="User prompts for llama2.", - required=True, - type=str, - ) - - parser.add_argument( - "--seq_len", - help="Ouput sequence length for llama2.", - default=128, - type=int, - ) - - parser.add_argument( - "--temperature", - help="Sampling temperature for llama2.", - default=0.8, - type=float, - ) - - parser.add_argument( - "-d", - "--dtype-override", - default="fp32", - type=str, - choices=["fp32", "fp16"], - help="Override the dtype of the model (default is the checkpoint dtype). Options: fp32", - ) - - parser.add_argument( - "--pre_gen_pte", - help="Run the Pre-generated llama2 in the given directory", - type=str, - ) - - parser.add_argument( - "--num_sharding", - type=int, - default=0, - help="Specify the number of splits by inserting the fallback custom op. The graph will be split evenly by layers.", - ) - - parser.add_argument( - "--model_mode", - help="Export and inference batch_prefill mode, kv mode or hybrid(TBD) mode", - default="kv", - choices=["batch_prefill", "kv", "hybrid"], - type=str, - ) - - args = parser.parse_args() - if args.compile_only and args.pre_gen_pte: - exit("Cannot set both compile_only and pre_gen_pte as true") - - if args.pre_gen_pte: - inference(args, args.pre_gen_pte) - exit(f"Finish the running pre_gen_pte from {args.pre_gen_pte}") - - if args.compile_only: - compile(args) - exit(f"Finish compile_only and save to {args.artifact}") - - try: - compile(args) - inference(args) - except Exception as e: - if args.ip and args.port != -1: - with Client((args.ip, args.port)) as conn: - conn.send(json.dumps({"Error": str(e)})) - else: - raise Exception(e) - - -# flake8: noqa: C901 -if __name__ == "__main__": - main() diff --git a/examples/qualcomm/oss_scripts/llama2/qnn_llama_runner.cpp b/examples/qualcomm/oss_scripts/llama2/qnn_llama_runner.cpp deleted file mode 100644 index 1e46f919dc..0000000000 --- a/examples/qualcomm/oss_scripts/llama2/qnn_llama_runner.cpp +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) Qualcomm Innovation Center, Inc. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/** - * @file - * - * This tool can run ExecuTorch model files with Qualcomm AI Engine Direct. - * - * User could specify arguments like desired prompt, temperature, etc. - */ - -#include -#include -#include - -#include - -#include -#include - -DEFINE_string( - model_path, - "qnn_llama2.pte", - "Model serialized in flatbuffer format."); - -DEFINE_string( - output_folder_path, - "outputs", - "Executorch inference data output path."); - -DEFINE_string(tokenizer_path, "tokenizer.bin", "Tokenizer stuff."); - -DEFINE_string(prompt, "The answer to the ultimate question is", "Prompt."); - -DEFINE_double( - temperature, - 0.8f, - "Temperature; Default is 0.8f. 0 = greedy argmax sampling (deterministic). 
Lower temperature = more deterministic"); - -DEFINE_int32( - seq_len, - 128, - "Total number of tokens to generate (prompt + output). Defaults to max_seq_len. If the number of input tokens + seq_len > max_seq_len, the output will be truncated to max_seq_len tokens."); - -using executorch::runtime::Error; -using executorch::runtime::MemoryAllocator; -using executorch::runtime::MethodMeta; -using executorch::runtime::Result; - -int main(int argc, char** argv) { - gflags::ParseCommandLineFlags(&argc, &argv, true); - - const char* tokenizer_path = FLAGS_tokenizer_path.c_str(); - const char* prompt = FLAGS_prompt.c_str(); - double temperature = FLAGS_temperature; - int32_t seq_len = FLAGS_seq_len; - - // create llama runner - example::Runner runner(FLAGS_model_path, tokenizer_path, temperature); - ET_CHECK_MSG(runner.load() == Error::Ok, "Runner failed to load method"); - - // MethodMeta describes the memory requirements of the method. - Result method_meta = runner.get_method_meta(); - ET_CHECK_MSG( - method_meta.ok(), - "Failed to get method_meta 0x%x", - (unsigned int)method_meta.error()); - ET_CHECK_MSG( - runner.mem_alloc(MemoryAllocator::kDefaultAlignment, seq_len) == - Error::Ok, - "Runner failed to allocate memory"); - - // generate tokens - std::string inference_output; - // prompt are determined by command line arguments - // pos_ids, atten_mask are infered inside runner - runner.generate(prompt, seq_len, [&](const std::string& piece) { - inference_output += piece; - }); - - size_t inference_index = 0; - auto output_file_name = FLAGS_output_folder_path + "/output_" + - std::to_string(inference_index++) + "_0.raw"; - std::ofstream fout(output_file_name.c_str()); - fout << inference_output; - fout.close(); - - return 0; -} diff --git a/examples/qualcomm/oss_scripts/llama2/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama2/runner/runner.cpp deleted file mode 100644 index 3f05512732..0000000000 --- a/examples/qualcomm/oss_scripts/llama2/runner/runner.cpp +++ /dev/null @@ -1,671 +0,0 @@ -/* - * Copyright (c) Qualcomm Innovation Center, Inc. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -// A simple llama2 runner that includes preprocessing and post processing logic. -// The module takes in a string as input and emits a string as output. - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -using executorch::aten::ScalarType; -using executorch::aten::SizesType; -using executorch::aten::Tensor; -using executorch::extension::from_blob; -using executorch::extension::Module; -using executorch::extension::TensorPtr; -using executorch::extension::llm::BPETokenizer; -using executorch::extension::llm::Sampler; -using executorch::extension::llm::time_in_ms; -using executorch::runtime::Error; -using executorch::runtime::EValue; -using executorch::runtime::MethodMeta; -using executorch::runtime::Result; -using executorch::runtime::TensorInfo; - -// TODO: Remove this usage of an internal-only function. 
-using executorch::runtime::internal::set_tensor_data; - -namespace example { - -namespace { -static constexpr auto kTopp = 0.9f; -void printReport(const Runner::Stats& stats); -std::string statsToJsonString(const Runner::Stats& stats); -} // namespace - -Runner::Runner( - const std::string& model_path, - const std::string& tokenizer_path, - const float temperature) - : module_(std::make_unique( - model_path, - Module::LoadMode::MmapUseMlockIgnoreErrors)), - tokenizer_path_(tokenizer_path), - model_path_(model_path), - temperature_(temperature) { - ET_LOG( - Info, - "Creating LLaMa runner: model_path=%s, tokenizer_path=%s", - model_path.c_str(), - tokenizer_path.c_str()); -} - -bool Runner::is_loaded() const { - return module_->is_loaded() && tokenizer_ && sampler_; -} - -Error Runner::load() { - if (is_loaded()) { - return Error::Ok; - } - stats_.model_load_start_ms = time_in_ms(); - ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method("forward")); - - // Read out metadata from the model - ET_LOG(Info, "Reading metadata from model"); - const auto method_names = module_->method_names(); - ET_CHECK_MSG(method_names.ok(), "Failed to read method names from model"); - model_methods_ = method_names.get(); - vocab_size_ = getMetadataHelper("get_vocab_size", 32000); - bos_id_ = getMetadataHelper("get_bos_id", 1); - eos_id_ = getMetadataHelper("get_eos_id", 2); - n_bos_ = getMetadataHelper("get_n_bos", 1); - n_eos_ = getMetadataHelper("get_n_eos", 1); - max_seq_len_ = getMetadataHelper("get_max_seq_len", 128); - head_dim_ = getMetadataHelper("get_head_dim", 32); - dim_ = getMetadataHelper("get_dim", 4096); - - // Load tokenizer - tokenizer_ = std::make_unique(); - tokenizer_->load(tokenizer_path_); - if (tokenizer_->bos_tok() != bos_id_) { - ET_LOG( - Error, - "Tokenizer's BOS id %lu does not match model's BOS id %ld, will override tokenizer's BOS.", - tokenizer_->bos_tok(), - bos_id_); - } - if (tokenizer_->eos_tok() != eos_id_) { - ET_LOG( - Error, - "Tokenizer's EOS id %lu does not match model's EOS id %ld, will override tokenizer's EOS.", - tokenizer_->eos_tok(), - eos_id_); - } - // Create sampler - sampler_ = std::make_unique( - vocab_size_, - temperature_, - kTopp, - static_cast(std::time(nullptr))); - stats_.model_load_end_ms = time_in_ms(); - - return Error::Ok; -} - -template -T Runner::getMetadataHelper(std::string method_name, T default_val) { - T res = default_val; - if (model_methods_.count(method_name)) { - Result> outputs = module_->execute(method_name); - if (outputs.ok()) { - std::vector outs = outputs.get(); - if (outs.size() > 0) { - res = outs[0].to(); - } - } - } else { - ET_LOG( - Info, - "The model does not contain %s method, using default value %lld", - method_name.c_str(), - (long long)default_val); - } - ET_LOG(Info, "%s: %lld", method_name.c_str(), (long long)res); - return res; -} - -template -int32_t Runner::logitsToToken(const Tensor& logits_tensor) { - T* logits = logits_tensor.mutable_data_ptr(); - - // Since the logits are for all tokens, get the last token probabilities - T* logits_last = logits; - return sampler_->sample(logits_last); -} - -// Given an input token. Set up the inputs for the model and execute a single -// step. Returning the logits tensor. 
-Result Runner::run_model_step( - int64_t input_token, - TensorPtr& token, - TensorPtr& atten_mask, - TensorPtr& start_pos, - std::vector& kv_tensors, - std::vector& kv_outputs) { - token->mutable_data_ptr()[0] = input_token; - - // inputs:[tokens, start_pos, atten_mask, k_cache, v_cache] - std::vector inputs = { - token, atten_mask, start_pos}; - inputs.insert(inputs.end(), kv_tensors.begin(), kv_tensors.end()); - auto outputs_res = module_->forward(inputs); - ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error()); - - // TODO: need to handle batch size != 1 - size_t v_offset = kv_outputs[0]->nbytes(); - size_t el_size = kv_outputs[0]->element_size(); - size_t k_input_step = (max_seq_len_ - 1) * el_size; - int k_tensors_end = kv_tensors.size() / 2; - // update k caches - for (int j = 0; j < k_tensors_end; ++j) { - uint8_t* input_addr = - static_cast(kv_tensors[j]->mutable_data_ptr()); - uint8_t* output_addr = - static_cast(kv_outputs[j]->mutable_data_ptr()); - // fill the output k values back - for (int src = 0, dst = k_input_step; src < kv_outputs[j]->nbytes(); - src += el_size, dst += k_input_step) { - input_addr[dst] = output_addr[src]; - } - char* new_inp_addr = io_mem_mgr_.update_k_caches_read(j, el_size); - // inputs - ET_CHECK_MSG( - set_tensor_data( - *kv_tensors[j], new_inp_addr, kv_tensors[j]->nbytes()) == Error::Ok, - "Failed to set input tensor when updating k_cache"); - } - // update v caches - for (int j = k_tensors_end, v_idx = 0; j < kv_tensors.size(); ++j, ++v_idx) { - // inputs - char* new_inp_addr = io_mem_mgr_.update_v_caches_read(v_idx, v_offset); - - ET_CHECK_MSG( - set_tensor_data( - *kv_tensors[j], new_inp_addr, kv_tensors[j]->nbytes()) == Error::Ok, - "Failed to set input tensor when updating v_cache"); - // outputs - char* new_out_addr = io_mem_mgr_.update_v_caches_write(v_idx, v_offset); - ET_CHECK_MSG( - set_tensor_data( - *kv_outputs[j], new_out_addr, kv_outputs[j]->nbytes()) == Error::Ok, - "Failed to set output tensor when updating v_cache"); - ET_CHECK_MSG( - module_->set_output(*kv_outputs[j], j + 1) == Error::Ok, - "Failed to set llama output data pointer"); - } - - // Bump start_pos by 1 - start_pos->mutable_data_ptr()[0]++; - - // update atten_mask - atten_mask->mutable_data_ptr() - [atten_mask->numel() - 1 - start_pos->const_data_ptr()[0]] = 0; - return outputs_res.get()[0].toTensor(); -} -// TODO: add overloaded method for on-device tokenize -Error Runner::generate( - const std::string& prompt, - int32_t seq_len, - std::function token_callback, - std::function stats_callback) { - ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null"); - ET_CHECK_MSG(is_loaded(), "Please invoke load method first"); - - // First token time only measures the time it takes to encode the prompt and - // return a response token. - stats_.inference_start_ms = time_in_ms(); - shouldStop_ = false; - - // Set the sequence length to the max seq length if not provided - seq_len = (seq_len > 0 && seq_len <= max_seq_len_) ? 
seq_len : max_seq_len_; - - Result> encode_res = - tokenizer_->encode(prompt, n_bos_, 0); - - ET_CHECK_OK_OR_RETURN_ERROR( - encode_res.error(), "Failed to encode prompt %s", prompt.c_str()); - - // encode the (string) prompt into tokens sequence - std::vector prompt_tokens = encode_res.get(); - int num_prompt_tokens = prompt_tokens.size(); - - ET_CHECK_MSG( - num_prompt_tokens < max_seq_len_, - "Max seq length exceeded - please increase max seq len value in static_llama.py"); - - ET_CHECK_MSG( - num_prompt_tokens < seq_len, - "Sequence length exceeded - please increase the seq_len value passed to generate()"); - - int32_t pos = 0, prev_token, cur_token = prompt_tokens[0]; - std::vector token_shape = {1, 1}; - - io_mem_mgr_.get_input_token_ptr()[0] = 0; - std::vector start_pos_shape = {1, 1}; - - float* atten_mask_ptr = - reinterpret_cast(io_mem_mgr_.get_atten_mask_ptr()); - std::fill(atten_mask_ptr, atten_mask_ptr + max_seq_len_, -255); - atten_mask_ptr[max_seq_len_ - 1] = 0; - - std::vector atten_mask_shape = {1, max_seq_len_}; - - std::vector logits_data_shape = {1, vocab_size_}; - - std::vector hidden_states_data_shape = {1, 1, dim_}; - - // initialize tensor wrappers - auto token = from_blob( - io_mem_mgr_.get_input_token_ptr(), token_shape, ScalarType::Int); - auto start_pos = from_blob( - io_mem_mgr_.get_pos_idx_ptr(), start_pos_shape, ScalarType::Int); - auto atten_mask = from_blob( - io_mem_mgr_.get_atten_mask_ptr(), atten_mask_shape, ScalarType::Float); - - std::vector kv_tensors, kv_outputs; - - Result method_meta = get_method_meta(); - size_t num_inputs = method_meta->num_inputs(); - int k_caches_num = (num_inputs - 3) / 2; - - // TODO: need to handle batch size != 1 - // k caches init - for (int input_index = 3, i = 0; input_index < k_caches_num + 3; - ++input_index, ++i) { - // inputs - Result tensor_meta = - method_meta->input_tensor_meta(input_index); - - auto tensor_shape = tensor_meta->sizes(); - std::vector sizes( - tensor_shape.data(), tensor_shape.data() + tensor_shape.size()); - kv_tensors.emplace_back(from_blob( - io_mem_mgr_.get_k_caches_read_ptr(i), - sizes, - tensor_meta->scalar_type())); - - // outpus - Result out_tensor_meta = method_meta->output_tensor_meta(i + 1); - tensor_shape = out_tensor_meta->sizes(); - sizes = std::vector{ - tensor_shape.data(), tensor_shape.data() + tensor_shape.size()}; - kv_outputs.emplace_back(from_blob( - io_mem_mgr_.get_k_caches_write_ptr(i), - sizes, - kv_tensors.back()->scalar_type())); - ET_CHECK_MSG( - module_->set_output(kv_outputs.back(), i + 1) == Error::Ok, - "Failed to set output tensor for kv cache"); - } - - // v caches init - for (int i = 0, input_index = k_caches_num + 3; input_index < num_inputs; - ++input_index, ++i) { - int output_index = i + k_caches_num + 1; - // inputs - Result tensor_meta = - method_meta->input_tensor_meta(input_index); - auto tensor_shape = tensor_meta->sizes(); - std::vector sizes( - tensor_shape.data(), tensor_shape.data() + tensor_shape.size()); - - kv_tensors.emplace_back(from_blob( - io_mem_mgr_.get_v_caches_read_ptr(i), - sizes, - tensor_meta->scalar_type())); - - // outputs - Result out_tensor_meta = - method_meta->output_tensor_meta(output_index); - tensor_shape = out_tensor_meta->sizes(); - sizes = std::vector{ - tensor_shape.data(), tensor_shape.data() + tensor_shape.size()}; - - kv_outputs.push_back(from_blob( - io_mem_mgr_.get_v_caches_write_ptr(i), - sizes, - kv_tensors.back()->scalar_type())); - ET_CHECK_MSG( - module_->set_output(kv_outputs.back(), output_index) == Error::Ok, - 
"Failed to set output tensor for llama block"); - } - - auto affine_logits = from_blob( - reinterpret_cast(io_mem_mgr_.get_logit_ptr()), - logits_data_shape, - ScalarType::Float); - ET_CHECK_MSG( - module_->set_output(affine_logits) == Error::Ok, - "Failed to set output tensor for affine module - logits"); - - // Start consuming user's prompts and generating new tokens - std::string final_output; - while (pos < seq_len - 1) { - // Run the model - auto logits_res = run_model_step( - cur_token, token, atten_mask, start_pos, kv_tensors, kv_outputs); - if (pos == num_prompt_tokens) { - stats_.first_token_ms = time_in_ms(); - } else if (pos == num_prompt_tokens - 1) { - stats_.prompt_eval_end_ms = time_in_ms(); - } - - ET_CHECK_OK_OR_RETURN_ERROR(logits_res.error()); - Tensor& logits_tensor = logits_res.get(); - prev_token = cur_token; - long sample_start_time_ms = time_in_ms(); - - cur_token = logitsToToken(logits_tensor); - stats_.aggregate_sampling_time_ms += time_in_ms() - sample_start_time_ms; - - // advance the state machine - if (pos < num_prompt_tokens - 1) { - // prefill, force the next token to be the next prompt token - cur_token = prompt_tokens[pos + 1]; - } - pos++; - - // print the token as string, decode it with the Tokenizer object - auto piece_res = tokenizer_->decode(prev_token, cur_token); - ET_CHECK(piece_res.ok()); - - if (token_callback) { - token_callback(piece_res.get()); - } - - if (shouldStop_) { - break; - } - - // data-dependent terminating condition: we have n_eos_ number of EOS - if (pos >= num_prompt_tokens && cur_token == eos_id_) { - ET_LOG(Info, "Reached to the end of generation"); - break; - } - } - stats_.inference_end_ms = time_in_ms(); - - if (pos == seq_len) { - ET_LOG(Info, "Sequence length (%i tokens) reached!", seq_len); - } - - stats_.num_prompt_tokens = num_prompt_tokens; - stats_.num_generated_tokens = pos - num_prompt_tokens; - printReport(stats_); - if (stats_callback) { - stats_callback(stats_); - } - - return Error::Ok; -} - -namespace { -void printReport(const Runner::Stats& stats) { - printf("PyTorchObserver %s\n", statsToJsonString(stats).c_str()); - - ET_LOG( - Info, - "\tPrompt Tokens: %" PRIu64 " Generated Tokens: %" PRIu64, - stats.num_prompt_tokens, - stats.num_generated_tokens); - - ET_LOG( - Info, - "\tModel Load Time:\t\t%f (seconds)", - ((double)(stats.model_load_end_ms - stats.model_load_start_ms) / - stats.SCALING_FACTOR_UNITS_PER_SECOND)); - double inference_time_ms = - (double)(stats.inference_end_ms - stats.inference_start_ms); - ET_LOG( - Info, - "\tTotal inference time:\t\t%f (seconds)\t\t Rate: \t%f (tokens/second)", - inference_time_ms / stats.SCALING_FACTOR_UNITS_PER_SECOND, - - (stats.num_generated_tokens) / - (double)(stats.inference_end_ms - stats.inference_start_ms) * - stats.SCALING_FACTOR_UNITS_PER_SECOND); - double prompt_eval_time = - (double)(stats.prompt_eval_end_ms - stats.inference_start_ms); - ET_LOG( - Info, - "\t\tPrompt evaluation:\t%f (seconds)\t\t Rate: \t%f (tokens/second)", - prompt_eval_time / stats.SCALING_FACTOR_UNITS_PER_SECOND, - (stats.num_prompt_tokens) / prompt_eval_time * - stats.SCALING_FACTOR_UNITS_PER_SECOND); - - double eval_time = - (double)(stats.inference_end_ms - stats.prompt_eval_end_ms); - ET_LOG( - Info, - "\t\tGenerated %" PRIu64 - " tokens:\t%f (seconds)\t\t Rate: \t%f (tokens/second)", - stats.num_generated_tokens, - eval_time / stats.SCALING_FACTOR_UNITS_PER_SECOND, - stats.num_generated_tokens / eval_time * - stats.SCALING_FACTOR_UNITS_PER_SECOND); - - // Time to first token is 
measured from the start of inference, excluding - // model load time. - ET_LOG( - Info, - "\tTime to first generated token:\t%f (seconds)", - ((double)(stats.first_token_ms - stats.inference_start_ms) / - stats.SCALING_FACTOR_UNITS_PER_SECOND)); - - ET_LOG( - Info, - "\tSampling time over %" PRIu64 " tokens:\t%f (seconds)", - stats.num_prompt_tokens + stats.num_generated_tokens, - (double)stats.aggregate_sampling_time_ms / - stats.SCALING_FACTOR_UNITS_PER_SECOND); -} - -std::string statsToJsonString(const Runner::Stats& stats) { - std::stringstream ss; - ss << "{\"prompt_tokens\":" << stats.num_prompt_tokens << "," - << "\"generated_tokens\":" << stats.num_generated_tokens << "," - << "\"model_load_start_ms\":" << stats.model_load_start_ms << "," - << "\"model_load_end_ms\":" << stats.model_load_end_ms << "," - << "\"inference_start_ms\":" << stats.inference_start_ms << "," - << "\"inference_end_ms\":" << stats.inference_end_ms << "," - << "\"prompt_eval_end_ms\":" << stats.prompt_eval_end_ms << "," - << "\"first_token_ms\":" << stats.first_token_ms << "," - << "\"aggregate_sampling_time_ms\":" << stats.aggregate_sampling_time_ms - << "," << "\"SCALING_FACTOR_UNITS_PER_SECOND\":" - << stats.SCALING_FACTOR_UNITS_PER_SECOND << "}"; - return ss.str(); -} -} // namespace - -IoMemMgr::IoMemMgr(MethodMeta method_meta) { - method_meta_ = std::make_unique(method_meta); - init_io_info(); - compute_total_nbytes(); -} - -void IoMemMgr::init_io_info() { - set_tensor_meta(); - for (auto info : io_info_.tensor_info) { - info->size = info->tensor_meta->nbytes(); - info->rank = info->tensor_meta->sizes().size(); - info->shape.resize(info->rank); - for (int i = 0; i < info->rank; i++) { - info->shape[i] = - static_cast(info->tensor_meta->sizes().data()[i]); - } - info->dtype = info->tensor_meta->scalar_type(); - info->element_size = scalar_type_to_size[info->tensor_meta->scalar_type()]; - } -}; - -void IoMemMgr::set_tensor_meta() { - io_info_.input_token.tensor_meta = - std::make_unique(method_meta_->input_tensor_meta(0).get()); - io_info_.atten_mask.tensor_meta = - std::make_unique(method_meta_->input_tensor_meta(1).get()); - io_info_.pos_idx.tensor_meta = - std::make_unique(method_meta_->input_tensor_meta(2).get()); - - io_info_.k_caches_read.tensor_meta = - std::make_unique(method_meta_->input_tensor_meta(3).get()); - io_info_.k_caches_write.tensor_meta = - std::make_unique(method_meta_->output_tensor_meta(1).get()); - - io_info_.v_caches_read.tensor_meta = std::make_unique( - method_meta_->input_tensor_meta(method_meta_->num_inputs() - 1).get()); - io_info_.v_caches_write.tensor_meta = std::make_unique( - method_meta_->output_tensor_meta(method_meta_->num_outputs() - 1).get()); - - io_info_.logit.tensor_meta = - std::make_unique(method_meta_->output_tensor_meta(0).get()); -} - -void IoMemMgr::compute_total_nbytes() { - total_nbytes_ = io_info_.input_token.size + io_info_.pos_idx.size + - io_info_.atten_mask.size + io_info_.logit.size; - size_t num_heads = (method_meta_->num_inputs() - 3) / 2; - - // To update v cache via shifting pointer, v caches need a buffer with size - // of (max_seq_len_ - 1) * head_dim_. 
It is equivalent to one more cache - size_t num_v_cache = num_heads + 1; - // To update v cache via shifting pointer, k buffer need the size of - // max_seq_len - 1 - size_t k_buffer = io_info_.k_caches_read.size / io_info_.k_caches_write.size; - - // k_caches_read need a buffer with size of head_dim_ - total_nbytes_ += num_heads * io_info_.k_caches_read.size + k_buffer; - total_nbytes_ += num_heads * io_info_.k_caches_write.size; - total_nbytes_ += num_v_cache * io_info_.v_caches_read.size; - // Add a head dim size for the convinience of shifting ptr from the last - // non-used v cache write - total_nbytes_ += io_info_.v_caches_write.size; -} - -bool IoMemMgr::init_tensors() { - size_t cur_pos = input_token_pos_; - pos_idx_pos_ = cur_pos += io_info_.input_token.size; - atten_mask_pos_ = cur_pos += io_info_.pos_idx.size; - logit_pos_ = cur_pos += io_info_.atten_mask.size; - set_input_token_ptr(); - set_pos_idx_ptr(); - set_atten_mask_ptr(); - set_logit_ptr(); - - // set start point of kv caches - cur_pos += io_info_.logit.size; - - size_t num_heads = (method_meta_->num_inputs() - 3) / 2; - k_caches_read_pos_.resize(num_heads); - k_caches_write_pos_.resize(num_heads); - v_caches_read_pos_.resize(num_heads); - v_caches_write_pos_.resize(num_heads); - - for (int i = 0; i < num_heads; i++) { - set_k_caches_read(i, cur_pos); - cur_pos += io_info_.k_caches_read.size; - } - // add a size of k caches buffer - cur_pos += io_info_.k_caches_read.size / io_info_.k_caches_write.size; - for (int i = 0; i < num_heads; i++) { - set_k_caches_write(i, cur_pos); - cur_pos += io_info_.k_caches_write.size; - } - - for (int i = 0; i < num_heads; i++) { - set_v_caches_read(i, cur_pos); - set_v_caches_write(i, cur_pos + io_info_.v_caches_read.size); - cur_pos += io_info_.v_caches_read.size; - } - // add a caches as the b caches buffer - cur_pos += io_info_.v_caches_read.size; - return cur_pos <= total_nbytes_; -} - -void IoMemMgr::set_all_shifted_ptrs(size_t seq_len) { - auto iter_setter = [&](std::vector& cache, - size_t shift_size, - InfoAttrs& tensor_info) { - for (int i = 0; i < cache.size(); ++i) { - size_t pos = cache[i] + shift_size; - CustomMemTensorInfo info = { - ptr_, - ptr_ + pos, - pos, - tensor_info.size, - tensor_info.shape.data(), - tensor_info.rank, - tensor_info.dtype}; - QnnExecuTorchAddCustomMemTensorInfo(info); - } - }; - for (int i = 0; i < seq_len; ++i) { - iter_setter( - k_caches_read_pos_, - i * io_info_.k_caches_read.element_size, - io_info_.k_caches_read); - iter_setter( - v_caches_read_pos_, - i * io_info_.v_caches_write.size, - io_info_.v_caches_read); - iter_setter( - v_caches_write_pos_, - i * io_info_.v_caches_write.size, - io_info_.v_caches_write); - } -} - -void Runner::stop() { - shouldStop_ = true; -} - -Result Runner::get_method_meta() { - return module_->method_meta("forward"); -} - -Error Runner::mem_alloc(size_t alignment, size_t seq_len) { - Result method_meta_result = get_method_meta(); - io_mem_mgr_ = IoMemMgr(method_meta_result.get()); - ET_CHECK_MSG( - io_mem_mgr_.allocate(alignment), - "IoMemMgr failed to allocate custom memory"); - - ET_CHECK_MSG( - io_mem_mgr_.init_tensors(), - "IoMemMgr required more bytes than allocated bytes"); - - io_mem_mgr_.set_all_shifted_ptrs(seq_len); - // To register rpc_mem_handle from SharedBuffer - // Reset and re-init again to trigger registered function - module_.reset(); - module_ = std::make_unique( - model_path_, Module::LoadMode::MmapUseMlockIgnoreErrors); - ET_CHECK_MSG(load() == Error::Ok, "Runner failed to load method"); - - 
return Error::Ok; -} - -// explicit instantiation of template methods -template int64_t Runner::getMetadataHelper( - std::string method_name, - int64_t default_val); -template bool Runner::getMetadataHelper( - std::string method_name, - bool default_val); - -} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama2/runner/runner.h b/examples/qualcomm/oss_scripts/llama2/runner/runner.h deleted file mode 100644 index aa0e5eb0ec..0000000000 --- a/examples/qualcomm/oss_scripts/llama2/runner/runner.h +++ /dev/null @@ -1,280 +0,0 @@ -/* - * Copyright (c) Qualcomm Innovation Center, Inc. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -// A simple llama2 runner that includes preprocessing and post processing logic. -// The module takes in a string as input and emits a string as output. - -#pragma once - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -class RpcMemAllocator { - public: - RpcMemAllocator(QnnMemDescriptor shared_buffer_type) - : shared_buffer_type_(shared_buffer_type){}; - bool allocate(size_t bytes, size_t alignment) { - ptr_ = QnnExecuTorchAllocCustomMem(bytes, alignment); - if (ptr_ == nullptr) { - ET_LOG( - Info, - "Allocate Rpc mem falied, fallback to nromal ptr: bytes=%zu, alignment=%zu", - bytes, - alignment); - input_data_.resize(bytes); - ptr_ = input_data_.data(); - } - return ptr_ != nullptr; - } - - ~RpcMemAllocator() { - if (shared_buffer_type_ == QnnMemDescriptor::kIon || - shared_buffer_type_ == QnnMemDescriptor::kCustom) { - if (ptr_ != nullptr) { - QnnExecuTorchFreeCustomMem(ptr_); - } - } - } - - void* GetPtr() { - return ptr_; - } - - private: - QnnMemDescriptor shared_buffer_type_; - void* ptr_{nullptr}; - std::vector input_data_; - std::vector tensor_base_addrs_; -}; - -#define DEFINE_IOMEMMGR_ACCESSOR(name) \ - size_t get_##name##_pos() const { \ - return name##_pos_; \ - } \ - char* get_##name##_ptr() const { \ - return reinterpret_cast(ptr_) + name##_pos_; \ - } \ - char* set_##name##_ptr() { \ - CustomMemTensorInfo info = { \ - ptr_, \ - ptr_ + name##_pos_, \ - name##_pos_, \ - io_info_.name.size, \ - io_info_.name.shape.data(), \ - io_info_.name.rank, \ - io_info_.name.dtype}; \ - QnnExecuTorchAddCustomMemTensorInfo(info); \ - return reinterpret_cast(ptr_) + name##_pos_; \ - } - -#define DEFINE_IOMEMMGR_VEC_ACCESSOR(name) \ - const std::vector& get_##name##_pos_vec() const { \ - return name##_pos_; \ - } \ - char* get_##name##_ptr(int idx) { \ - return ptr_ + name##_pos_[idx]; \ - } \ - char* set_##name(int idx, size_t pos) { \ - name##_pos_[idx] = pos; \ - CustomMemTensorInfo info = { \ - ptr_, \ - ptr_ + name##_pos_[idx], \ - name##_pos_[idx], \ - io_info_.name.size, \ - io_info_.name.shape.data(), \ - io_info_.name.rank, \ - io_info_.name.dtype}; \ - QnnExecuTorchAddCustomMemTensorInfo(info); \ - return reinterpret_cast(ptr_) + pos; \ - } \ - char* update_##name(int idx, size_t shift_size) { \ - name##_pos_[idx] += shift_size; \ - return reinterpret_cast(ptr_) + name##_pos_[idx]; \ - } - -namespace example { -class IoMemMgr { - public: - // Allocate a big memory which is capable to contain all IO of all modules - IoMemMgr(){}; - IoMemMgr(executorch::runtime::MethodMeta method_meta); - - struct InfoAttrs { - std::unique_ptr tensor_meta; - size_t size = 0; - std::vector shape; - uint32_t rank; - size_t element_size; - executorch::aten::ScalarType dtype; - }; - - 
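Editor's note: the `IoMemMgr` declared in this deleted header packs every model input and output (input token, position index, attention mask, logits, and all K/V cache read/write regions) into one shared buffer and only shifts offsets between decode steps. The Python sketch below illustrates that offset planning under assumed per-tensor byte sizes, with the spare-buffer padding from `compute_total_nbytes()` omitted for brevity; it is an illustration of the layout idea, not the code being removed.

```python
# Illustrative only: plan byte offsets for the packed IO buffer described above.
def plan_io_layout(sizes: dict, num_heads: int) -> dict:
    """Return per-region byte offsets within one contiguous buffer."""
    offsets, cur = {}, 0
    for name in ("input_token", "pos_idx", "atten_mask", "logit"):
        offsets[name] = cur
        cur += sizes[name]

    # K caches: per-head read regions first, then per-head write regions.
    offsets["k_read"] = [cur + i * sizes["k_read"] for i in range(num_heads)]
    cur += num_heads * sizes["k_read"]
    offsets["k_write"] = [cur + i * sizes["k_write"] for i in range(num_heads)]
    cur += num_heads * sizes["k_write"]

    # V caches: each head's write region sits one cache past its read region, so
    # shifting the read offset forward each step lands on the previous step's output.
    offsets["v_read"] = [cur + i * sizes["v_read"] for i in range(num_heads)]
    offsets["v_write"] = [off + sizes["v_read"] for off in offsets["v_read"]]
    cur += (num_heads + 1) * sizes["v_read"]
    return {"offsets": offsets, "total_bytes": cur}


# Arbitrary example sizes, for demonstration only.
print(plan_io_layout(
    {"input_token": 4, "pos_idx": 4, "atten_mask": 512, "logit": 128000,
     "k_read": 16256, "k_write": 128, "v_read": 16256},
    num_heads=4,
))
```

Placing each write region one cache ahead of its read region is what lets the runner advance the V-cache pointers by a fixed stride every step instead of copying cache contents.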
struct IoInfo { - InfoAttrs input_token; - InfoAttrs atten_mask; - InfoAttrs pos_idx; - InfoAttrs k_caches_read; - InfoAttrs k_caches_write; - InfoAttrs v_caches_read; - InfoAttrs v_caches_write; - InfoAttrs logit; - std::vector tensor_info{ - &input_token, - &atten_mask, - &pos_idx, - &k_caches_read, - &k_caches_write, - &v_caches_read, - &v_caches_write, - &logit, - }; - }; - - bool allocate(size_t alignment) { - bool ret = rpc_mem_allocator.allocate(total_nbytes_, alignment); - ptr_ = reinterpret_cast(rpc_mem_allocator.GetPtr()); - return ret; - } - bool init_tensors(); - - char* get_custom_mem_ptr() { - return ptr_; - } - - // Pointers of k cache read, v cache read and write are shifted every step. - // Set them first to register mem handle during qnn delegation init. - void set_all_shifted_ptrs(size_t max_seq_len); - - DEFINE_IOMEMMGR_ACCESSOR(atten_mask); - DEFINE_IOMEMMGR_ACCESSOR(input_token); - DEFINE_IOMEMMGR_ACCESSOR(pos_idx); - DEFINE_IOMEMMGR_ACCESSOR(logit); - - DEFINE_IOMEMMGR_VEC_ACCESSOR(k_caches_read); - DEFINE_IOMEMMGR_VEC_ACCESSOR(k_caches_write); - DEFINE_IOMEMMGR_VEC_ACCESSOR(v_caches_read); - DEFINE_IOMEMMGR_VEC_ACCESSOR(v_caches_write); - - private: - size_t total_nbytes_{0}; - char* ptr_{nullptr}; - void compute_total_nbytes(); - void set_tensor_meta(); - void init_io_info(); - - size_t atten_mask_pos_; - size_t input_token_pos_{0}; - size_t logit_pos_; - size_t pos_idx_pos_; - std::vector k_caches_read_pos_; - std::vector k_caches_write_pos_; - std::vector v_caches_read_pos_; - std::vector v_caches_write_pos_; - - IoInfo io_info_; - std::unique_ptr method_meta_; - RpcMemAllocator rpc_mem_allocator{QnnMemDescriptor::kCustom}; - std::unordered_map scalar_type_to_size = - { - {executorch::aten::ScalarType::Int, sizeof(int32_t)}, - {executorch::aten::ScalarType::Float, sizeof(float)}, - {executorch::aten::ScalarType::Char, sizeof(int8_t)}, - {executorch::aten::ScalarType::Short, sizeof(int16_t)}, - {executorch::aten::ScalarType::Byte, sizeof(uint8_t)}, - {executorch::aten::ScalarType::Bits16, sizeof(uint16_t)}, - }; -}; - -class Runner { - public: - explicit Runner( - const std::string& model_path, - const std::string& tokenizer_path, - const float temperature = 0.8f); - - struct Stats { - // Scaling factor for timestamps - in this case, we use ms. - const long SCALING_FACTOR_UNITS_PER_SECOND = 1000; - // Time stamps for the different stages of the execution - // model_load_start_ms: Start of model loading. - long model_load_start_ms; - // model_load_end_ms: End of model loading. - long model_load_end_ms; - // inference_start_ms: Immediately after the model is loaded (or we check - // for model load), measure the inference time. - long inference_start_ms; - // prompt_eval_end_ms: Prompt array allocation and tokenization. Ends right - // before the inference loop starts - long prompt_eval_end_ms; - // first_token: Timestamp when the first generated token is emitted - long first_token_ms; - // inference_end_ms: End of inference/generation. - long inference_end_ms; - // Keep a running total of the time spent in sampling. 
- long aggregate_sampling_time_ms; - // Token count from prompt - int64_t num_prompt_tokens; - // Token count from generated (total - prompt) - int64_t num_generated_tokens; - }; - - bool is_loaded() const; - executorch::runtime::Error load(); - executorch::runtime::Error mem_alloc(size_t alignment, size_t seq_len); - executorch::runtime::Error generate( - const std::string& prompt, - int32_t seq_len, - std::function token_callback = {}, - std::function stats_callback = {}); - void stop(); - executorch::runtime::Result - get_method_meta(); - - private: - // metadata - template - T getMetadataHelper(std::string method_name, T default_val); - template - int32_t logitsToToken(const executorch::aten::Tensor& logits_tensor); - executorch::runtime::Result run_model_step( - int64_t input_token, - ::executorch::extension::TensorPtr& token, - ::executorch::extension::TensorPtr& atten_mask, - ::executorch::extension::TensorPtr& start_pos, - std::vector<::executorch::extension::TensorPtr>& kv_tensors, - std::vector<::executorch::extension::TensorPtr>& kv_outputs); - // metadata - int32_t vocab_size_; - int64_t bos_id_; - int64_t eos_id_; - int32_t n_bos_; - int32_t n_eos_; - int32_t max_seq_len_; - int32_t head_dim_; - int32_t dim_; - std::unordered_set model_methods_; - std::unique_ptr module_; - std::string tokenizer_path_; - std::string model_path_; - float temperature_; - std::unique_ptr tokenizer_; - std::unique_ptr sampler_; - bool shouldStop_{false}; - Stats stats_; - IoMemMgr io_mem_mgr_; -}; - -} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama3_2/README.md b/examples/qualcomm/oss_scripts/llama3_2/README.md deleted file mode 100644 index 51de982b1b..0000000000 --- a/examples/qualcomm/oss_scripts/llama3_2/README.md +++ /dev/null @@ -1,39 +0,0 @@ -# Summary - -## Overview -This file provides instructions to run LLAMA3.2 1B and 3B (WIP) with different parameters via the Qualcomm HTP backend. In LLAMA3.2, we offer the following modes to execute the model: - -Prefill Mode: This is also known as batch prefill mode, where the model takes in a list of tokens as input and generates the next token along with the key-value (KV) cache for all tokens. This mode is efficient for generating the initial sequence of tokens (usually the user's prompt). - -KV Cache Mode: In KV Cache mode, the model takes in a single previous token and generates the next predicted token along with its KV cache. It is efficient for generating subsequent tokens after the initial prompt. - -Hybrid Mode: Hybrid mode leverages the strengths of both batch prefill and KV cache modes to optimize token generation speed. Initially, it uses prefill mode to efficiently generate the prompt's key-value (KV) cache. Then, the mode switches to KV cache mode, which excels at generating subsequent tokens. - -## Instructions -### Note -1. For hybrid mode, the export time will be longer and can take up to 2-4 hours to complete. -2. When exporting a hybrid mode model, please ensure the device has at least 80 GB of memory and swap space. - -### Step 1: Setup -1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. -2. Follow the [tutorial](https://pytorch.org/executorch/stable/build-run-qualcomm-ai-engine-direct-backend.html) to build Qualcomm AI Engine Direct Backend. - -### Step 2: Prepare Model -1. Follow the [instructions](https://www.llama.com/) to download models. 
-At the end of this step, users should have the following files ready: consolidated.00.pth, params.json, and tokenizer.model. - -### Step3: Run default examples using hybrid mode. -Default example using hybrid mode. -```bash -python examples/qualcomm/oss_scripts/llama3_2/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --prompt "what is 1+1" --temperature 0 --model_size 1B --model_mode hybrid --prefill_seq_len 32 --kv_seq_len 128 -``` - -If you would like to compile the model only, we have provided the flag `--compile_only`. -```bash -python examples/qualcomm/oss_scripts/llama3_2/llama.py -b build-android -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --prompt "what is 1+1" --temperature 0 --model_size 1B --model_mode hybrid --prefill_seq_len 32 --kv_seq_len 128 --compile_only -``` - -On the other hand, if you already have a pre-compiled .pte model, you can perform inference by providing the flag `--pre_gen_pte` and specifying the folder that contains the .pte model. -```bash -python examples/qualcomm/oss_scripts/llama3_2/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --prompt "what is 1+1" --temperature 0 --model_size 1B --model_mode hybrid --prefill_seq_len 32 --kv_seq_len 128 --pre_gen_pte ${FOLDER_TO_PRE_GEN_PTE} -``` \ No newline at end of file diff --git a/examples/xnnpack/README.md b/examples/xnnpack/README.md index a519d935b5..624dd33b84 100644 --- a/examples/xnnpack/README.md +++ b/examples/xnnpack/README.md @@ -86,7 +86,7 @@ After exporting the XNNPACK Delegated model, we can now try running it with exam cd executorch # Get a clean cmake-out directory -./install_requirements.sh --clean +./install_executorch.sh --clean mkdir cmake-out # Configure cmake diff --git a/exir/passes/TARGETS b/exir/passes/TARGETS index 1cf3c3e2fc..ba300f7032 100644 --- a/exir/passes/TARGETS +++ b/exir/passes/TARGETS @@ -11,6 +11,7 @@ python_library( ":const_prop_pass", ":debug_handle_generator_pass", ":external_constants_pass", + ":init_mutable_pass", ":insert_write_back_for_buffers_pass", ":memory_format_ops_pass", ":memory_planning_pass", diff --git a/extension/pybindings/README.md b/extension/pybindings/README.md index 8c1adf2c22..767c0266ed 100644 --- a/extension/pybindings/README.md +++ b/extension/pybindings/README.md @@ -9,7 +9,7 @@ pip install . --no-build-isolation Or when installing the rest of dependencies: ```bash -install_requirements.sh --pybind +install_executorch.sh --pybind ``` # Link Backends @@ -25,7 +25,7 @@ pip install . --no-build-isolation Similarly, when installing the rest of dependencies: ```bash -install_requirements.sh --pybind coreml mps xnnpack +install_executorch.sh --pybind coreml mps xnnpack ``` ## Functions diff --git a/extension/training/README.md b/extension/training/README.md index 44195471a7..f6f8d5139a 100644 --- a/extension/training/README.md +++ b/extension/training/README.md @@ -230,7 +230,7 @@ After exporting the model for training, we can now try learning using CMake. 
We cd executorch # Get a clean cmake-out directory -./install_requirements.sh --clean +./install_executorch.sh --clean mkdir cmake-out # Configure cmake diff --git a/install_requirements.bat b/install_executorch.bat similarity index 84% rename from install_requirements.bat rename to install_executorch.bat index 4cfe4b21c4..863ade7bdb 100644 --- a/install_requirements.bat +++ b/install_executorch.bat @@ -16,6 +16,6 @@ if "%PYTHON_EXECUTABLE%"=="" ( ) ) -"%PYTHON_EXECUTABLE%" install_requirements.py %* +"%PYTHON_EXECUTABLE%" install_executorch.py %* -exit /b %ERRORLEVEL% \ No newline at end of file +exit /b %ERRORLEVEL% diff --git a/install_executorch.py b/install_executorch.py new file mode 100644 index 0000000000..37ef3185ad --- /dev/null +++ b/install_executorch.py @@ -0,0 +1,142 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024-25 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +import argparse +import glob +import itertools +import os +import shutil +import subprocess +import sys + +from install_requirements import ( + install_requirements, + python_is_compatible, + TORCH_NIGHTLY_URL, +) + + +def clean(): + print("Cleaning build artifacts...") + print("Cleaning pip-out/...") + shutil.rmtree("pip-out/", ignore_errors=True) + dirs = glob.glob("cmake-out*/") + glob.glob("cmake-android-out/") + for d in dirs: + print(f"Cleaning {d}...") + shutil.rmtree(d, ignore_errors=True) + print("Done cleaning build artifacts.") + + +VALID_PYBINDS = ["coreml", "mps", "xnnpack"] + + +def main(args): + if not python_is_compatible(): + sys.exit(1) + + # Parse options. + + EXECUTORCH_BUILD_PYBIND = "" + CMAKE_ARGS = os.getenv("CMAKE_ARGS", "") + use_pytorch_nightly = True + + parser = argparse.ArgumentParser() + parser.add_argument( + "--pybind", + action="append", + nargs="+", + help="one or more of coreml/mps/xnnpack, or off", + ) + parser.add_argument( + "--clean", + action="store_true", + help="clean build artifacts and pip-out instead of installing", + ) + parser.add_argument( + "--use-pt-pinned-commit", + action="store_true", + help="build from the pinned PyTorch commit instead of nightly", + ) + args = parser.parse_args(args) + if args.pybind: + # Flatten list of lists. + args.pybind = list(itertools.chain(*args.pybind)) + if "off" in args.pybind: + if len(args.pybind) != 1: + raise Exception( + f"Cannot combine `off` with other pybinds: {args.pybind}" + ) + EXECUTORCH_BUILD_PYBIND = "OFF" + else: + for pybind_arg in args.pybind: + if pybind_arg not in VALID_PYBINDS: + raise Exception( + f"Unrecognized pybind argument {pybind_arg}; valid options are: {', '.join(VALID_PYBINDS)}" + ) + EXECUTORCH_BUILD_PYBIND = "ON" + CMAKE_ARGS += f" -DEXECUTORCH_BUILD_{pybind_arg.upper()}=ON" + + if args.clean: + clean() + return + + if args.use_pt_pinned_commit: + # This option is used in CI to make sure that PyTorch build from the pinned commit + # is used instead of nightly. CI jobs wouldn't be able to catch regression from the + # latest PT commit otherwise + use_pytorch_nightly = False + + install_requirements(use_pytorch_nightly) + + # If --pybind is not set explicitly for backends (e.g., --pybind xnnpack) + # or is not turned off explicitly (--pybind off) + # then install XNNPACK by default. + if EXECUTORCH_BUILD_PYBIND == "": + EXECUTORCH_BUILD_PYBIND = "ON" + CMAKE_ARGS += " -DEXECUTORCH_BUILD_XNNPACK=ON" + + # Use ClangCL on Windows. 
+ # ClangCL is an alias to Clang that configures it to work in an MSVC-compatible + # mode. Using it on Windows to avoid compiler compatibility issues for MSVC. + if os.name == "nt": + CMAKE_ARGS += " -T ClangCL" + + # + # Install executorch pip package. This also makes `flatc` available on the path. + # The --extra-index-url may be necessary if pyproject.toml has a dependency on a + # pre-release or nightly version of a torch package. + # + + # Set environment variables + os.environ["EXECUTORCH_BUILD_PYBIND"] = EXECUTORCH_BUILD_PYBIND + os.environ["CMAKE_ARGS"] = CMAKE_ARGS + + # Run the pip install command + subprocess.run( + [ + sys.executable, + "-m", + "pip", + "install", + ".", + "--no-build-isolation", + "-v", + "--extra-index-url", + TORCH_NIGHTLY_URL, + ], + check=True, + ) + + +if __name__ == "__main__": + # Before doing anything, cd to the directory containing this script. + os.chdir(os.path.dirname(os.path.abspath(__file__))) + if not python_is_compatible(): + sys.exit(1) + + main(sys.argv[1:]) diff --git a/install_executorch.sh b/install_executorch.sh new file mode 100755 index 0000000000..ec8cad1226 --- /dev/null +++ b/install_executorch.sh @@ -0,0 +1,10 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Before doing anything, cd to the directory containing this script. +cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null || /bin/true +./run_python_script.sh ./install_executorch.py "$@" diff --git a/install_requirements.py b/install_requirements.py index 409460ca10..4450367ff4 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -5,20 +5,12 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. - import argparse -import glob -import itertools -import os import platform import re -import shutil import subprocess import sys -# Before doing anything, cd to the directory containing this script. -os.chdir(os.path.dirname(os.path.abspath(__file__))) - def python_is_compatible(): # Scrape the version range from pyproject.toml, which should be in the current directory. @@ -65,20 +57,6 @@ def python_is_compatible(): return True -def clean(): - print("Cleaning build artifacts...") - print("Cleaning pip-out/...") - shutil.rmtree("pip-out/", ignore_errors=True) - dirs = glob.glob("cmake-out*/") + glob.glob("cmake-android-out/") - for d in dirs: - print(f"Cleaning {d}...") - shutil.rmtree(d, ignore_errors=True) - print("Done cleaning build artifacts.") - - -VALID_PYBINDS = ["coreml", "mps", "xnnpack"] - - # The pip repository that hosts nightly torch packages. TORCH_NIGHTLY_URL = "https://download.pytorch.org/whl/nightly/cpu" @@ -104,34 +82,15 @@ def install_requirements(use_pytorch_nightly): if use_pytorch_nightly else "torchvision" ), # For testing. - "typing-extensions", ] - # pip packages needed to run examples. - # TODO: Make each example publish its own requirements.txt EXAMPLES_REQUIREMENTS = [ - "timm==1.0.7", f"torchaudio==2.6.0.{NIGHTLY_VERSION}" if use_pytorch_nightly else "torchaudio", - "torchsr==1.0.4", - "transformers==4.47.1", - ] - - # pip packages needed for development. - DEVEL_REQUIREMENTS = [ - "cmake", # For building binary targets. - "pip>=23", # For building the pip package. - "pyyaml", # Imported by the kernel codegen tools. - "setuptools>=63", # For building the pip package. 
- "tomli", # Imported by extract_sources.py when using python < 3.11. - "wheel", # For building the pip package archive. - "zstd", # Imported by resolve_buck.py. ] # Assemble the list of requirements to actually install. # TODO: Add options for reducing the number of requirements. - REQUIREMENTS_TO_INSTALL = ( - EXIR_REQUIREMENTS + DEVEL_REQUIREMENTS + EXAMPLES_REQUIREMENTS - ) + REQUIREMENTS_TO_INSTALL = EXIR_REQUIREMENTS + EXAMPLES_REQUIREMENTS # Install the requirements. `--extra-index-url` tells pip to look for package # versions on the provided URL if they aren't available on the default URL. @@ -141,6 +100,8 @@ def install_requirements(use_pytorch_nightly): "-m", "pip", "install", + "-r", + "requirements-examples.txt", *REQUIREMENTS_TO_INSTALL, "--extra-index-url", TORCH_NIGHTLY_URL, @@ -160,6 +121,8 @@ def install_requirements(use_pytorch_nightly): "-m", "pip", "install", + # Without --no-build-isolation, setup.py can't find the torch module. + "--no-build-isolation", *LOCAL_REQUIREMENTS, ], check=True, @@ -167,104 +130,21 @@ def install_requirements(use_pytorch_nightly): def main(args): - if not python_is_compatible(): - sys.exit(1) - - # Parse options. - - EXECUTORCH_BUILD_PYBIND = "" - CMAKE_ARGS = os.getenv("CMAKE_ARGS", "") - CMAKE_BUILD_ARGS = os.getenv("CMAKE_BUILD_ARGS", "") - use_pytorch_nightly = True - parser = argparse.ArgumentParser() - parser.add_argument( - "--pybind", - action="append", - nargs="+", - help="one or more of coreml/mps/xnnpack, or off", - ) - parser.add_argument( - "--clean", - action="store_true", - help="clean build artifacts and pip-out instead of installing", - ) parser.add_argument( "--use-pt-pinned-commit", action="store_true", help="build from the pinned PyTorch commit instead of nightly", ) args = parser.parse_args(args) - if args.pybind: - # Flatten list of lists. - args.pybind = list(itertools.chain(*args.pybind)) - if "off" in args.pybind: - if len(args.pybind) != 1: - raise Exception( - f"Cannot combine `off` with other pybinds: {args.pybind}" - ) - EXECUTORCH_BUILD_PYBIND = "OFF" - else: - for pybind_arg in args.pybind: - if pybind_arg not in VALID_PYBINDS: - raise Exception( - f"Unrecognized pybind argument {pybind_arg}; valid options are: {', '.join(VALID_PYBINDS)}" - ) - EXECUTORCH_BUILD_PYBIND = "ON" - CMAKE_ARGS += f" -DEXECUTORCH_BUILD_{pybind_arg.upper()}=ON" - - if args.clean: - clean() - return - - if args.use_pt_pinned_commit: - # This option is used in CI to make sure that PyTorch build from the pinned commit - # is used instead of nightly. CI jobs wouldn't be able to catch regression from the - # latest PT commit otherwise - use_pytorch_nightly = False - - install_requirements(use_pytorch_nightly) - - # If --pybind is not set explicitly for backends (e.g., --pybind xnnpack) - # or is not turned off explicitly (--pybind off) - # then install XNNPACK by default. - if EXECUTORCH_BUILD_PYBIND == "": - EXECUTORCH_BUILD_PYBIND = "ON" - CMAKE_ARGS += " -DEXECUTORCH_BUILD_XNNPACK=ON" - - # Use ClangCL on Windows. - # ClangCL is an alias to Clang that configures it to work in an MSVC-compatible - # mode. Using it on Windows to avoid compiler compatibility issues for MSVC. - if os.name == "nt": - CMAKE_ARGS += " -T ClangCL" - - # - # Install executorch pip package. This also makes `flatc` available on the path. - # The --extra-index-url may be necessary if pyproject.toml has a dependency on a - # pre-release or nightly version of a torch package. 
- # - - # Set environment variables - os.environ["EXECUTORCH_BUILD_PYBIND"] = EXECUTORCH_BUILD_PYBIND - os.environ["CMAKE_ARGS"] = CMAKE_ARGS - os.environ["CMAKE_BUILD_ARGS"] = CMAKE_BUILD_ARGS - - # Run the pip install command - subprocess.run( - [ - sys.executable, - "-m", - "pip", - "install", - ".", - "--no-build-isolation", - "-v", - "--extra-index-url", - TORCH_NIGHTLY_URL, - ], - check=True, - ) + install_requirements(use_pytorch_nightly=not bool(args.use_pt_pinned_commit)) if __name__ == "__main__": + import os + + # Before doing anything, cd to the directory containing this script. + os.chdir(os.path.dirname(os.path.abspath(__file__))) + if not python_is_compatible(): + sys.exit(1) main(sys.argv[1:]) diff --git a/install_requirements.sh b/install_requirements.sh index 6caaa880e6..ef156cd020 100755 --- a/install_requirements.sh +++ b/install_requirements.sh @@ -7,19 +7,4 @@ # Before doing anything, cd to the directory containing this script. cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null || /bin/true - -# Find the names of the python tools to use. -if [[ -z $PYTHON_EXECUTABLE ]]; -then - if [[ -z $CONDA_DEFAULT_ENV ]] || [[ $CONDA_DEFAULT_ENV == "base" ]] || [[ ! -x "$(command -v python)" ]]; - then - PYTHON_EXECUTABLE=python3 - else - PYTHON_EXECUTABLE=python - fi -fi - -$PYTHON_EXECUTABLE ./install_requirements.py "$@" - -# Exit with the same status as the python script. -exit $? +./run_python_script.sh ./install_requirements.py "$@" diff --git a/requirements-examples.txt b/requirements-examples.txt new file mode 100644 index 0000000000..d4126a178a --- /dev/null +++ b/requirements-examples.txt @@ -0,0 +1,5 @@ +# pip packages needed to run examples. +# TODO: Make each example publish its own requirements.txt +timm == 1.0.7 +torchsr == 1.0.4 +transformers ==4.47.1 diff --git a/run_python_script.sh b/run_python_script.sh new file mode 100755 index 0000000000..4f9a74ec36 --- /dev/null +++ b/run_python_script.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Before doing anything, cd to the directory containing this script. +cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null || /bin/true + +# Find the names of the python tools to use. +if [[ -z $PYTHON_EXECUTABLE ]]; +then + if [[ -z $CONDA_DEFAULT_ENV ]] || [[ $CONDA_DEFAULT_ENV == "base" ]] || [[ ! -x "$(command -v python)" ]]; + then + PYTHON_EXECUTABLE=python3 + else + PYTHON_EXECUTABLE=python + fi +fi + +SCRIPT="$1"; shift +$PYTHON_EXECUTABLE $SCRIPT "$@" + +# Exit with the same status as the python script. +exit $? diff --git a/third-party/ao b/third-party/ao index 2e032c6b0d..11333ba2cb 160000 --- a/third-party/ao +++ b/third-party/ao @@ -1 +1 @@ -Subproject commit 2e032c6b0de960dee554dcb08126ace718b14c6d +Subproject commit 11333ba2cb5c4e792bc4f5c0d70c12991f972008
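Editor's note on the refactor above: after this patch, `install_requirements.py` is only responsible for Python dependencies (the pinned torch packages plus `requirements-examples.txt`), while `install_executorch.py` owns `--pybind`, `--clean`, `--use-pt-pinned-commit`, and the `pip install .` of ExecuTorch itself; both shell wrappers delegate to `run_python_script.sh` to pick the Python interpreter. The sketch below is one possible way to drive the new entry point from other tooling; the flag names come from this diff, but the wrapper function itself is hypothetical.

```python
# Illustrative driver only: invoke the new install_executorch.py entry point.
import subprocess
import sys


def install(pybinds=("xnnpack",), clean_first=False, use_pinned_pt=False):
    if clean_first:
        # Removes pip-out/ and cmake-out*/ via install_executorch.py's clean().
        subprocess.run([sys.executable, "install_executorch.py", "--clean"], check=True)
    cmd = [sys.executable, "install_executorch.py", "--pybind", *pybinds]
    if use_pinned_pt:
        cmd.append("--use-pt-pinned-commit")  # CI path: pinned PyTorch commit, not nightly
    subprocess.run(cmd, check=True)


if __name__ == "__main__":
    install(pybinds=("xnnpack",))
```

Passing `--pybind off` remains the way to skip pybind builds entirely, and combining `off` with a backend name still raises an exception, matching the behavior of the removed logic in install_requirements.py.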