
Commit

Merge branch 'main' into change-969691
AdrianLundell authored Jan 20, 2025
2 parents 9bb1b79 + fedb035 commit 0a30d97
Showing 74 changed files with 1,039 additions and 2,244 deletions.
6 changes: 3 additions & 3 deletions .ci/scripts/utils.sh
@@ -17,17 +17,17 @@ retry () {
}

clean_executorch_install_folders() {
./install_requirements.sh --clean
./install_executorch.sh --clean
}

install_executorch() {
which pip
# Install executorch, this assumes that Executorch is checked out in the
# current directory.
if [[ "${1:-}" == "use-pt-pinned-commit" ]]; then
./install_requirements.sh --pybind xnnpack --use-pt-pinned-commit
./install_executorch.sh --pybind xnnpack --use-pt-pinned-commit
else
./install_requirements.sh --pybind xnnpack
./install_executorch.sh --pybind xnnpack
fi
# Just print out the list of packages for debugging
pip list
2 changes: 1 addition & 1 deletion .github/workflows/apple.yml
@@ -9,7 +9,7 @@ on:
paths:
- .ci/scripts/setup-ios.sh
- .github/workflows/apple.yml
- install_requirements.sh
- install_executorch.sh
- backends/apple/**
- build/build_apple_frameworks.sh
- build/build_apple_llm_demo.sh
13 changes: 8 additions & 5 deletions .github/workflows/pull.yml
@@ -200,7 +200,7 @@ jobs:
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"
# install pybind
bash install_requirements.sh --pybind xnnpack
bash install_executorch.sh --pybind xnnpack
# install Llava requirements
bash examples/models/llama/install_requirements.sh
@@ -333,6 +333,9 @@ jobs:

unittest-arm:
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
permissions:
id-token: write
contents: read
with:
runner: linux.2xlarge
docker-image: executorch-ubuntu-22.04-arm-sdk
@@ -433,7 +436,7 @@ jobs:
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"
# install pybind
bash install_requirements.sh --pybind xnnpack
bash install_executorch.sh --pybind xnnpack
# install phi-3-mini requirements
bash examples/models/phi-3-mini/install_requirements.sh
@@ -460,7 +463,7 @@ jobs:
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"
# install pybind
bash install_requirements.sh --pybind xnnpack
bash install_executorch.sh --pybind xnnpack
# install llama requirements
bash examples/models/llama/install_requirements.sh
@@ -487,7 +490,7 @@ jobs:
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"
# install pybind
bash install_requirements.sh --pybind xnnpack
bash install_executorch.sh --pybind xnnpack
# install llama requirements
bash examples/models/llama/install_requirements.sh
@@ -514,7 +517,7 @@ jobs:
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"
# install pybind
bash install_requirements.sh --pybind xnnpack
bash install_executorch.sh --pybind xnnpack
# install llama requirements
bash examples/models/llama/install_requirements.sh
6 changes: 6 additions & 0 deletions .github/workflows/trunk.yml
@@ -132,6 +132,9 @@ jobs:
test-arm-backend-delegation:
name: test-arm-backend-delegation
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
permissions:
id-token: write
contents: read
with:
runner: linux.2xlarge
docker-image: executorch-ubuntu-22.04-arm-sdk
@@ -159,6 +162,9 @@ jobs:
test-arm-reference-delegation:
name: test-arm-reference-delegation
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
permissions:
id-token: write
contents: read
with:
runner: linux.2xlarge
docker-image: executorch-ubuntu-22.04-arm-sdk
2 changes: 1 addition & 1 deletion backends/apple/mps/setup.md
@@ -97,7 +97,7 @@ I 00:00:00.122615 executorch:mps_executor_runner.mm:501] Model verified successf
### [Optional] Run the generated model directly using pybind
1. Make sure `pybind` MPS support was installed:
```bash
./install_requirements.sh --pybind mps
./install_executorch.sh --pybind mps
```
2. Run the `mps_example` script to trace the model and run it directly from python:
```bash
2 changes: 1 addition & 1 deletion backends/cadence/build_cadence_fusionG3.sh
@@ -12,7 +12,7 @@ unset XTENSA_CORE
export XTENSA_CORE=FCV_FG3GP
git submodule sync
git submodule update --init
./install_requirements.sh
./install_executorch.sh

rm -rf cmake-out

2 changes: 1 addition & 1 deletion backends/cadence/build_cadence_hifi4.sh
@@ -12,7 +12,7 @@ unset XTENSA_CORE
export XTENSA_CORE=nxp_rt600_RI23_11_newlib
git submodule sync
git submodule update --init
./install_requirements.sh
./install_executorch.sh

rm -rf cmake-out

11 changes: 6 additions & 5 deletions backends/cadence/fusion_g3/operators/op_dequantize.cpp
@@ -67,8 +67,8 @@ void check_dequantize_per_tensor_args(

ET_CHECK_MSG(
input.scalar_type() == dtype,
"input.scalar_type() %" PRId8 " is not matching dtype argumenta:",
static_cast<int8_t>(input.scalar_type()));
"input.scalar_type() %s is not matching dtype arguments:",
::executorch::runtime::toString(input.scalar_type()));

if (out_dtype.has_value()) {
ET_CHECK_MSG(
@@ -561,11 +561,12 @@ Tensor& dequantize_per_tensor_out(
const Tensor& input,
double scale,
int64_t zero_point,
int64_t quant_min,
int64_t quant_max,
__ET_UNUSED int64_t quant_min,
__ET_UNUSED int64_t quant_max,
ScalarType dtype,
::executorch::aten::optional<ScalarType> out_dtype,
Tensor& out) {
constexpr ScalarType out_dtype = ScalarType::Float;

#ifdef OP_ARG_CHECK
torch::executor::Error err = resize_tensor(out, input.sizes());
ET_CHECK_MSG(
@@ -119,7 +119,7 @@ __attribute__((noinline)) void conv2d_nchw_core_generic(
if (((_h + d0 * _wh - p0) >= 0) &&
((_h + d0 * _wh - p0) < h) &&
((_w + d1 * _ww - p1) >= 0) &&
((_w + d1 * _ww - p1 < w))) {
((_w + d1 * _ww - p1) < w)) {
int ioff =
(_h + d0 * _wh - p0) * w + (_w + d1 * _ww - p1);
int woff = _wh * ww + _ww;
6 changes: 0 additions & 6 deletions backends/qualcomm/_passes/insert_requantize.py
@@ -89,15 +89,9 @@ def _single_output_annotation(
requantize_dict = n.meta.pop(QCOM_REQUANTIZE)
# {quant_attr: user_node_name_list}
group_quant_attr_dict = self._invert_dict(requantize_dict)
# TODO: If users of the node contain output node,
# we replace the node with to_copy op. However, it would
# be problem when the node has multiple to_copy ops
add_output = len(group_quant_attr_dict) == 1

for hashable_quant_attr, user_nodes in group_quant_attr_dict.items():
user_nodes_copy = user_nodes.copy()
if add_output:
user_nodes_copy.append("output")
self._insert_to_copy(gm, n, dict(hashable_quant_attr), user_nodes_copy)

def _insert(self, graph_module: torch.fx.GraphModule) -> torch.fx.GraphModule:
74 changes: 68 additions & 6 deletions backends/qualcomm/quantizer/custom_annotation.py
@@ -14,17 +14,80 @@
QuantizationConfig,
)
from executorch.exir.dialects._ops import ops as exir_ops
from torch.ao.quantization.observer import MinMaxObserver
from torch.ao.quantization.observer import FixedQParamsObserver, MinMaxObserver
from torch.ao.quantization.quantizer import (
QuantizationAnnotation,
QuantizationSpec,
SharedQuantizationSpec,
)
from torch.fx import Node


def annotate_matmul_16a8w( # noqa: C901
gm: torch.fx.GraphModule, traverse_input1=True
) -> None:
def annotate_linear_16a8w_in_affine_layer(gm: torch.fx.GraphModule) -> None:
def annotate_conv2d(node: Node, quantization_config: QuantizationConfig) -> None:
input_qspec_map = {}
input_act = node.args[0]
input_spec = quantization_config.input_activation
input_qspec_map[input_act] = input_spec

weight = node.args[1]
input_qspec_map[weight] = quantization_config.weight

node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation(
input_qspec_map=input_qspec_map,
output_qspec=quantization_config.output_activation,
_annotated=True,
)

quantization_config_16a8w_per_channel = get_ptq_per_channel_quant_config(
torch.uint16, weight_dtype=torch.int8, act_observer=MinMaxObserver
)
for node in gm.graph.nodes:
if node.op == "call_function" and node.target == torch.ops.aten.conv2d.default:
if "nn_module_stack" in node.meta:
module_values_list = list(node.meta["nn_module_stack"].values())
full_qualified_name = module_values_list[-1][0]
if full_qualified_name == "output.conv":
annotate_conv2d(
node, quantization_config=quantization_config_16a8w_per_channel
)


def annotate_prefill_kv_output(gm: torch.fx.GraphModule, kv_quant_attrs: dict):
for node in gm.graph.nodes:
if node.op == "output":
for index, prefill_output in enumerate(node.args[0]):
kv_quant_attr = kv_quant_attrs[index]
fixed_observer = FixedQParamsObserver.with_args(
scale=kv_quant_attr[0],
zero_point=kv_quant_attr[1],
quant_min=kv_quant_attr[2],
quant_max=kv_quant_attr[3],
dtype=kv_quant_attr[4],
qscheme=torch.torch.per_tensor_affine,
)

fixed_output_spec = QuantizationSpec(
quant_min=kv_quant_attr[2],
quant_max=kv_quant_attr[3],
dtype=kv_quant_attr[4],
ch_axis=0,
observer_or_fake_quant_ctr=fixed_observer,
)

input_qspec_map = {}
for input in prefill_output.args:
if isinstance(input, Node):
input_qspec_map[input] = fixed_output_spec

prefill_output.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation(
input_qspec_map=input_qspec_map,
output_qspec=fixed_output_spec,
_annotated=True,
)


def annotate_matmul_16a8w(gm: torch.fx.GraphModule) -> None: # noqa: C901
"""
This function is specific for matmul op 16a8w.
For k, we will tag such as the below, and
@@ -142,8 +205,7 @@ def annotate_matmul_input1(node: Node):
for node in gm.graph.nodes:
if node.op == "call_function" and node.target == torch.ops.aten.matmul.default:
annotate_matmul(node, quantization_config_16a8w)
if traverse_input1:
annotate_matmul_input1(node.args[1])
annotate_matmul_input1(node.args[1])


def custom_annotate_llama_matmul_16a8w(gm: torch.fx.GraphModule) -> None: # noqa: C901
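Note on the `FixedQParamsObserver` usage added in `annotate_prefill_kv_output` above: the observer simply echoes back the scale and zero point it was constructed with, so the prefill KV outputs are quantized with the pre-computed KV-cache parameters rather than with calibration statistics. A minimal standalone sketch of that behavior (the numeric values below are invented for illustration and are not taken from this change):

```python
import torch
from torch.ao.quantization.observer import FixedQParamsObserver

# Invented example values; in the change above they come from kv_quant_attrs.
obs = FixedQParamsObserver(
    scale=0.05,
    zero_point=0,
    dtype=torch.int8,
    qscheme=torch.per_tensor_affine,
    quant_min=-128,
    quant_max=127,
)
obs(torch.randn(4, 8))  # observing data does not change the fixed qparams
scale, zero_point = obs.calculate_qparams()
print(scale, zero_point)  # the fixed scale and zero point, exactly as constructed
```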
4 changes: 3 additions & 1 deletion backends/qualcomm/tests/test_qnn_delegate.py
@@ -3280,7 +3280,7 @@ def test_stories_single_llama(self):

cmds = [
"python",
f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama2/llama.py",
f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
"--artifact",
self.artifact_dir,
"--build_folder",
@@ -3307,6 +3307,8 @@
"16a4w",
"--temperature",
"0",
"--llama_model",
"stories110m",
]
if self.host:
cmds.extend(["--host", self.host])
2 changes: 1 addition & 1 deletion backends/vulkan/docs/android_demo.md
@@ -81,7 +81,7 @@ First, build and install ExecuTorch libraries, then build the LLaMA runner
binary using the Android NDK toolchain.

```shell
./install_requirements.sh --clean
./install_executorch.sh --clean
(mkdir cmake-android-out && \
cmake . -DCMAKE_INSTALL_PREFIX=cmake-android-out \
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
4 changes: 3 additions & 1 deletion backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl
@@ -33,7 +33,9 @@ ${layout_declare_ubo(8, "float", "out_min", "float", "out_max")}
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

// shared memory to hold calculated positions, this would reduce register usage thus improving performance.
shared ivec2 pos_shared[gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z * TILE_SIZE * TILE_SIZE];
// 64 is the number of threads in the local wg
$num_shared = 64 * TILE_SIZE * TILE_SIZE
shared ivec2 pos_shared[${num_shared}];

/*
* Computes a 2D pointwise convolution of an NxN output tile. Calculating an
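A brief note on the `pos_shared` change above: the `$num_shared = 64 * TILE_SIZE * TILE_SIZE` line appears to be a shader-codegen template assignment, so the shared-array size is substituted into the generated GLSL as a literal instead of being derived from `gl_WorkGroupSize`. A rough Python sketch of what that substitution amounts to (the `TILE_SIZE` value and the substitution mechanics are assumed for illustration; the real generator lives in the Vulkan backend's shader build tooling):

```python
# Assumed values purely for illustration; the real TILE_SIZE comes from the
# shader's codegen configuration, and the real substitution is done by the generator.
TILE_SIZE = 2
num_shared = 64 * TILE_SIZE * TILE_SIZE  # 64 threads per work group, TILE_SIZE*TILE_SIZE positions each

template_line = "shared ivec2 pos_shared[${num_shared}];"
print(template_line.replace("${num_shared}", str(num_shared)))
# -> shared ivec2 pos_shared[256];
```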
37 changes: 32 additions & 5 deletions backends/vulkan/test/op_tests/linear_weight_int4_test.cpp
@@ -30,16 +30,38 @@ at::Tensor linear_weight_int4_reference_impl(
const size_t ndim = original_x_size.size();
const int64_t out_features = weights_4x2.size(0);
const at::Tensor x_flattened = x.reshape({-1, original_x_size[ndim - 1]});
const at::Tensor packed_weights =
at::_convert_weight_to_int4pack(weights_4x2, inner_k_tiles);
at::Tensor out = at::_weight_int4pack_mm(
x_flattened, packed_weights, groupsize, scales_and_zeros);
at::Tensor out = at::_weight_int4pack_mm_for_cpu(
x_flattened, weights_4x2, groupsize, scales_and_zeros);
std::vector<int64_t> out_shape(
original_x_size.begin(), original_x_size.end());
out_shape.at(ndim - 1) = out_features;
return out.reshape(out_shape);
}

at::Tensor unpack_weights_4x2(const at::Tensor& weights_4x2) {
std::vector<int64_t> weights_shape(weights_4x2.sizes().vec());
weights_shape[1] *= 2;

at::Tensor weights_unpacked =
at::empty(weights_shape, at::device(at::kCPU).dtype(at::kInt));

const int64_t N = weights_unpacked.size(0);
const int64_t K = weights_unpacked.size(1);

for (int n = 0; n < N; n++) {
for (int k = 0; k < K; k += 2) {
const uint8_t packed_val = weights_4x2[n][k / 2].item().to<uint8_t>();
const uint8_t second_val = packed_val & 0x0F;
const uint8_t first_val = (packed_val & 0xF0) >> 4;

weights_unpacked[n][k] = int(first_val);
weights_unpacked[n][k + 1] = int(second_val);
}
}

return weights_unpacked;
}

at::Tensor dequantize_and_linear(
const at::Tensor& x,
const at::Tensor& weights_4x2,
@@ -91,13 +113,18 @@ void test_reference_linear_int4(
at::Tensor x = at::rand({B, M, K}, at::device(at::kCPU).dtype(at::kFloat));
at::Tensor weights_4x2 =
at::randint(0, 256, {N, K / 2}, at::device(at::kCPU).dtype(at::kByte));
at::Tensor weights_int = unpack_weights_4x2(weights_4x2);

const int k_groups = K / group_size;
at::Tensor scales_and_zeros =
at::rand({k_groups, N, 2}, at::device(at::kCPU).dtype(at::kFloat));

at::Tensor out = linear_weight_int4_reference_impl(
x, weights_4x2, group_size, scales_and_zeros, inner_k_tiles);
x,
at::_convert_weight_to_int4pack_for_cpu(weights_int, group_size),
group_size,
scales_and_zeros,
inner_k_tiles);

at::Tensor out_ref = dequantize_and_linear(
x, weights_4x2, group_size, scales_and_zeros, inner_k_tiles);
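The `unpack_weights_4x2` helper above expands the byte-packed int4 weights (two 4-bit values per byte, first value in the high nibble) so they can be fed to `at::_convert_weight_to_int4pack_for_cpu`. A minimal PyTorch sketch of the same packing convention in the opposite direction; `pack_int4_rows` is an invented name used only for this illustration:

```python
import torch

def pack_int4_rows(weights_int: torch.Tensor) -> torch.Tensor:
    """Pack an [N, K] tensor of values in [0, 15] into [N, K/2] bytes,
    first value of each pair in the high nibble (mirrors unpack_weights_4x2)."""
    high = weights_int[:, 0::2].to(torch.uint8)
    low = weights_int[:, 1::2].to(torch.uint8)
    return (high << 4) | low

w = torch.randint(0, 16, (2, 8), dtype=torch.int32)
packed = pack_int4_rows(w)  # shape [2, 4], dtype uint8
# Undo the packing the same way unpack_weights_4x2 does: high nibble first, then low.
unpacked = torch.stack(((packed >> 4) & 0x0F, packed & 0x0F), dim=-1).reshape(2, 8)
assert torch.equal(unpacked.to(torch.int32), w)
```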
2 changes: 1 addition & 1 deletion backends/xnnpack/README.md
@@ -98,7 +98,7 @@ After exporting the XNNPACK Delegated model, we can now try running it with exam
cd executorch

# Get a clean cmake-out directory
./install_requirements.sh --clean
./install_executorch.sh --clean
mkdir cmake-out

# Configure cmake
2 changes: 1 addition & 1 deletion build/test_ios.sh
@@ -63,7 +63,7 @@ say "Installing Requirements"

pip install --upgrade cmake pip setuptools wheel zstd

./install_requirements.sh --pybind coreml mps xnnpack
./install_executorch.sh --pybind coreml mps xnnpack
export PATH="$(realpath third-party/flatbuffers/cmake-out):$PATH"
./build/install_flatc.sh

(Diffs for the remaining changed files are not shown here.)
