# Copy tokenizer.model in Android benchmark spec #485
# Workflow file for this run
# NOTE: The GitHub viewer flagged this file as containing bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
name: android-perf

# Triggers: nightly schedule, changes to this workflow or the device-farm test
# spec template (on PRs and on main), manual dispatch, and reuse from other
# workflows via workflow_call.
on:
  schedule:
    # Every day at 00:00 (GitHub evaluates cron schedules in UTC).
    - cron: '0 0 * * *'
  pull_request:
    paths:
      - '.github/workflows/android-perf.yml'
      - 'extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml.j2'
  push:
    branches:
      - 'main'
    paths:
      - '.github/workflows/android-perf.yml'
      - 'extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml.j2'
  # Note: GitHub has an upper limit of 10 inputs
  workflow_dispatch:
    inputs:
      models:
        description: 'Models to be benchmarked'
        required: false
        type: string
        default: 'stories110M'
      devices:
        description: 'Target devices to run benchmark'
        required: false
        type: string
        default: 'samsung_galaxy_s22'
      benchmark_configs:
        description: 'The list of configs used the benchmark'
        required: false
        type: string
  # Same inputs as workflow_dispatch so callers can drive the workflow the
  # same way a manual run would.
  workflow_call:
    inputs:
      models:
        description: 'Models to be benchmarked'
        required: false
        type: string
        default: 'stories110M'
      devices:
        description: 'Target devices to run benchmark'
        required: false
        type: string
        default: 'samsung_galaxy_s22'
      benchmark_configs:
        description: 'The list of configs used the benchmark'
        required: false
        type: string
# One concurrency group per workflow + PR number (or ref name) + commit SHA on
# branch refs, further split by whether the run was dispatched manually or by
# the schedule. A newer run in the same group cancels the one in progress.
concurrency:
  group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}"
  cancel-in-progress: true
jobs:
  # Resolves which models/devices to benchmark and exposes the resulting
  # matrix as the `benchmark_configs` job output used by every other job.
  set-parameters:
    runs-on: ubuntu-22.04
    outputs:
      benchmark_configs: ${{ steps.set-parameters.outputs.benchmark_configs }}
    steps:
      - uses: actions/checkout@v3
        with:
          submodules: 'false'

      - uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      - name: Set parameters
        id: set-parameters
        shell: bash
        env:
          # Separate default values from the workflow dispatch. To ensure defaults are accessible
          # during scheduled runs and to provide flexibility for different defaults between
          # on-demand and periodic benchmarking.
          CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'llama,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8' || 'llama' }}
          CRON_DEFAULT_DEVICES: samsung_galaxy_s22
          # User-supplied inputs are handed to the script through the
          # environment instead of being expanded into the shell source, so
          # their content can never be parsed as shell code.
          REQUESTED_MODELS: ${{ inputs.models }}
          REQUESTED_DEVICES: ${{ inputs.devices }}
        run: |
          set -eux

          # Fall back to the defaults above when no input was given (e.g. on
          # schedule, push and pull_request events).
          MODELS="${REQUESTED_MODELS}"
          if [ -z "$MODELS" ]; then
            MODELS="$CRON_DEFAULT_MODELS"
          fi
          DEVICES="${REQUESTED_DEVICES}"
          if [ -z "$DEVICES" ]; then
            DEVICES="$CRON_DEFAULT_DEVICES"
          fi

          # NOTE(review): the script is presumably what publishes the
          # `benchmark_configs` step output; that is not visible here - confirm.
          PYTHONPATH="${PWD}" python .ci/scripts/gather_benchmark_configs.py \
            --os "android" \
            --models $MODELS \
            --devices $DEVICES

  # Renders the device-farm test spec for each matrix entry and uploads it,
  # together with the matrix entry itself (as JSON), to S3.
  prepare-test-specs:
    runs-on: linux.2xlarge
    needs: set-parameters
    strategy:
      matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }}
      fail-fast: false
    steps:
      - uses: actions/checkout@v3

      - name: Prepare the spec
        id: prepare
        shell: bash
        env:
          BENCHMARK_CONFIG: ${{ toJSON(matrix) }}
        working-directory: extension/benchmark/android/benchmark
        run: |
          set -eux

          # The model will be exported in the next step to this S3 path
          MODEL_PATH="https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }}/model.zip"
          # We could write a script to properly use jinja here, but there is only one variable,
          # so let's just sed it
          sed -i -e 's,{{ model_path }},'"${MODEL_PATH}"',g' android-llm-device-farm-test-spec.yml.j2

          # Replace every character outside [A-Za-z0-9._-] so the id is usable as a file name
          BENCHMARK_CONFIG_ID=$(echo "${{ matrix.model }}_${{ matrix.config }}" | sed -e 's/[^A-Za-z0-9._-]/_/g')
          # The config for this benchmark runs, we save it in the test spec so that it can be fetched
          # later by the upload script
          sed -i -e 's,{{ benchmark_config_id }},'"${BENCHMARK_CONFIG_ID}"',g' android-llm-device-farm-test-spec.yml.j2

          cp android-llm-device-farm-test-spec.yml.j2 android-llm-device-farm-test-spec.yml
          # Just print the test spec for debugging
          cat android-llm-device-farm-test-spec.yml

          # Save the benchmark configs so that we can use it later in the dashboard
          echo "${BENCHMARK_CONFIG}" > "${BENCHMARK_CONFIG_ID}.json"
          echo "benchmark-config-id=${BENCHMARK_CONFIG_ID}" >> "$GITHUB_OUTPUT"

      - name: Upload the spec
        uses: seemethere/upload-artifact-s3@v5
        with:
          s3-bucket: gha-artifacts
          s3-prefix: |
            ${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }}
          retention-days: 1
          if-no-files-found: error
          path: extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml

      - name: Update the benchmark configs
        uses: seemethere/upload-artifact-s3@v5
        with:
          s3-bucket: gha-artifacts
          s3-prefix: |
            ${{ github.repository }}/${{ github.run_id }}/artifacts/benchmark-configs/
          retention-days: 1
          if-no-files-found: error
          path: extension/benchmark/android/benchmark/${{ steps.prepare.outputs.benchmark-config-id }}.json

  # Exports each (model, config) matrix entry and places the result under
  # artifacts-to-be-uploaded/<model>_<config> for upload.
  export-models:
    name: export-models
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    needs: set-parameters
    secrets: inherit
    strategy:
      matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }}
      fail-fast: false
    with:
      runner: linux.2xlarge.memory
      docker-image: executorch-ubuntu-22.04-qnn-sdk
      submodules: 'true'
      timeout: 60
      upload-artifact: android-models
      upload-artifact-to-s3: true
      secrets-env: EXECUTORCH_HF_TOKEN
      script: |
        # The generic Linux job chooses to use base env, not the one setup by the image
        echo "::group::Setting up dev environment"
        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
        conda activate "${CONDA_ENV}"
        if [[ ${{ matrix.config }} == *"qnn"* ]]; then
          PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
          PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
        fi
        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"
        # Install requirements for export_llama
        PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh

        pip install -U "huggingface_hub[cli]"
        # Quoted so the token is always passed as exactly one argument
        huggingface-cli login --token "$SECRET_EXECUTORCH_HF_TOKEN"
        pip install accelerate sentencepiece
        pip list

        ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.config }}
        echo "::endgroup::"

        echo "::group::Exporting ${{ matrix.config }} model: ${{ matrix.model }}"
        BUILD_MODE="cmake"

        if [[ ${{ matrix.model }} =~ ^[^/]+/[^/]+$ ]]; then
          # HuggingFace model. Assume the pattern is always like "<org>/<repo>"
          HF_MODEL_REPO=${{ matrix.model }}
          # <repo> part, underscores turned into dashes, lower-cased, suffixed with the config
          OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.config }}"

          if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then
            # Llama models on Hugging Face
            if [[ ${{ matrix.config }} == "llama3_spinquant" ]]; then
              # SpinQuant
              # Download prequantized checkpoint from Hugging Face
              DOWNLOADED_PATH=$(
                bash .ci/scripts/download_hf_hub.sh \
                  --model_id "${HF_MODEL_REPO}" \
                  --files "tokenizer.model" "params.json" "consolidated.00.pth"
              )
              # Export using ExecuTorch's model definition
              python -m examples.models.llama.export_llama \
                --model "llama3_2" \
                --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
                --params "${DOWNLOADED_PATH}/params.json" \
                --use_sdpa_with_kv_cache \
                -X \
                --xnnpack-extended-ops \
                --preq_mode 8da4w_output_8da8w \
                --preq_group_size 32 \
                --max_seq_length 2048 \
                --output_name "${OUT_ET_MODEL_NAME}.pte" \
                -kv \
                -d fp32 \
                --preq_embedding_quantize 8,0 \
                --use_spin_quant native \
                --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
              ls -lh "${OUT_ET_MODEL_NAME}.pte"
            elif [[ ${{ matrix.config }} == "llama3_qlora" ]]; then
              # QAT + LoRA
              # Download prequantized checkpoint from Hugging Face
              DOWNLOADED_PATH=$(
                bash .ci/scripts/download_hf_hub.sh \
                  --model_id "${HF_MODEL_REPO}" \
                  --files "tokenizer.model" "params.json" "consolidated.00.pth"
              )
              # Export using ExecuTorch's model definition
              python -m examples.models.llama.export_llama \
                --model "llama3_2" \
                --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
                --params "${DOWNLOADED_PATH}/params.json" \
                -qat \
                -lora 16 \
                --preq_mode 8da4w_output_8da8w \
                --preq_group_size 32 \
                --preq_embedding_quantize 8,0 \
                --use_sdpa_with_kv_cache \
                -kv \
                -X \
                --xnnpack-extended-ops \
                -d fp32 \
                --max_seq_length 2048 \
                --output_name "${OUT_ET_MODEL_NAME}.pte" \
                --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
              ls -lh "${OUT_ET_MODEL_NAME}.pte"
            elif [[ ${{ matrix.config }} == "llama3_fb16" ]]; then
              # Original BF16 version, without any quantization
              DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
              python -m examples.models.llama.export_llama \
                --model "llama3_2" \
                --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
                --params "${DOWNLOADED_PATH}/params.json" \
                -kv \
                --use_sdpa_with_kv_cache \
                -X \
                -d bf16 \
                --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
                --output_name="${OUT_ET_MODEL_NAME}.pte"
              ls -lh "${OUT_ET_MODEL_NAME}.pte"
            elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then
              export QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728
              export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/
              export PYTHONPATH=$(pwd)/..

              DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
              python -m examples.qualcomm.oss_scripts.llama3_2.llama -- \
                --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
                --params "${DOWNLOADED_PATH}/params.json" \
                --tokenizer_model "${DOWNLOADED_PATH}/tokenizer.model" \
                --compile_only \
                --ptq 16a4w \
                -m SM8650 \
                --model_size 1B \
                --model_mode kv \
                --prompt "Once"

              OUT_ET_MODEL_NAME="llama3_2_qnn" # Qualcomm hard-coded it in their script
              # Move the generated .pte into the current directory unless it is already there
              find . -name "${OUT_ET_MODEL_NAME}.pte" -not -path "./${OUT_ET_MODEL_NAME}.pte" -exec mv {} ./ \;
              ls -lh "${OUT_ET_MODEL_NAME}.pte"
            else
              # By default, test with the Hugging Face model and the xnnpack recipe
              DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model")
              python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME"
              ls -lh "${OUT_ET_MODEL_NAME}.pte"
            fi
          else
            echo "Unsupported model ${{ matrix.model }}"
            exit 1
          fi

          # Bundle the exported program with its tokenizer (paths are dropped by -j)
          zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}/tokenizer.model"
          ls -lh model.zip
          mkdir -p "${ARTIFACTS_DIR_NAME}"
          mv model.zip "${ARTIFACTS_DIR_NAME}"
        elif [[ ${{ matrix.model }} == "llama" ]]; then
          # Install requirements for export_llama
          PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
          # Test llama2
          if [[ ${{ matrix.config }} == *"xnnpack"* ]]; then
            DELEGATE_CONFIG="xnnpack+custom+qe"
          elif [[ ${{ matrix.config }} == *"qnn"* ]]; then
            DELEGATE_CONFIG="qnn"
          else
            echo "Unsupported delegate ${{ matrix.config }}"
            exit 1
          fi
          DTYPE="fp32"
          PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh \
            -model "${{ matrix.model }}" \
            -build_tool "${BUILD_MODE}" \
            -dtype "${DTYPE}" \
            -mode "${DELEGATE_CONFIG}" \
            -upload "${ARTIFACTS_DIR_NAME}"
        else
          PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh \
            "${{ matrix.model }}" \
            "${BUILD_MODE}" \
            "${{ matrix.config }}" \
            "${ARTIFACTS_DIR_NAME}"
        fi
        echo "::endgroup::"

  # Builds the Android benchmark app (with QNN enabled) for arm64-v8a into
  # artifacts-to-be-uploaded.
  build-benchmark-app:
    name: build-benchmark-app
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    needs: set-parameters
    with:
      runner: linux.2xlarge
      docker-image: executorch-ubuntu-22.04-clang12-android
      submodules: 'true'
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      timeout: 90
      upload-artifact: android-apps
      upload-artifact-to-s3: true
      script: |
        set -eux

        # The generic Linux job chooses to use base env, not the one setup by the image
        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
        conda activate "${CONDA_ENV}"

        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh cmake
        export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded

        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
        PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh

        export ANDROID_ABIS="arm64-v8a"
        PYTHON_EXECUTABLE=python EXECUTORCH_BUILD_QNN=ON QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728 bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME}

  # Let's see how expensive this job is, we might want to tone it down by running it periodically
  benchmark-on-device:
    # always(): evaluated even when one of the jobs listed in `needs` failed
    if: always()
    permissions:
      id-token: write
      contents: read
    uses: pytorch/test-infra/.github/workflows/mobile_job.yml@main
    needs:
      - set-parameters
      - prepare-test-specs
      - build-benchmark-app
      - export-models
    strategy:
      matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }}
      fail-fast: false
    with:
      # Due to scheduling a job may be pushed beyond the default 60m threshold
      timeout: 120
      device-type: android
      runner: linux.2xlarge
      test-infra-ref: ''
      # This is the ARN of ExecuTorch project on AWS
      project-arn: arn:aws:devicefarm:us-west-2:308535385114:project:02a2cf0f-6d9b-45ee-ba1a-a086587469e6
      device-pool-arn: ${{ matrix.device_arn }}
      android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug.apk
      android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug-androidTest.apk
      test-spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }}/android-llm-device-farm-test-spec.yml

  # Pulls the device-farm artifacts and the saved benchmark configs from S3,
  # converts them into benchmark result JSON (v2 and v3 schemas) and uploads them.
  upload-benchmark-results:
    needs:
      - benchmark-on-device
    if: always()
    runs-on: linux.2xlarge
    environment: upload-benchmark-results
    permissions:
      id-token: write
      contents: read
    steps:
      - uses: actions/checkout@v3
        with:
          submodules: false

      - name: Authenticate with AWS
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
          # The max duration enforced by the server side
          role-duration-seconds: 18000
          aws-region: us-east-1

      - name: Setup conda
        uses: pytorch/test-infra/.github/actions/setup-miniconda@main
        with:
          python-version: '3.10'

      - name: Download the list of artifacts from S3
        env:
          ARTIFACTS_S3_DIR: s3://gha-artifacts/device_farm/${{ github.run_id }}/${{ github.run_attempt }}/artifacts/
        shell: bash
        run: |
          set -eux
          ${CONDA_RUN} python -mpip install awscli==1.32.18

          mkdir -p artifacts
          pushd artifacts
          ${CONDA_RUN} aws s3 sync "${ARTIFACTS_S3_DIR}" .
          popd

          ls -lah artifacts

      - name: Download the list of benchmark configs from S3
        env:
          BENCHMARK_CONFIGS_DIR: s3://gha-artifacts/${{ github.repository }}/${{ github.run_id }}/artifacts/benchmark-configs/
        shell: bash
        run: |
          set -eux

          mkdir -p benchmark-configs
          pushd benchmark-configs
          ${CONDA_RUN} aws s3 sync "${BENCHMARK_CONFIGS_DIR}" .
          popd

          ls -lah benchmark-configs

      - name: Extract the benchmark results JSON
        shell: bash
        env:
          # The head branch name of a pull request is chosen by the PR author.
          # It is passed through the environment (and quoted below) so it is
          # never expanded into the shell source of this step.
          HEAD_BRANCH: ${{ github.head_ref || github.ref_name }}
        run: |
          set -eux

          mkdir -p benchmark-results

          for ARTIFACTS_BY_JOB in artifacts/*.json; do
            # An unmatched glob stays literal; stop instead of processing a non-existent file
            [ -f "${ARTIFACTS_BY_JOB}" ] || break
            echo "${ARTIFACTS_BY_JOB}"
            ${CONDA_RUN} python .github/scripts/extract_benchmark_results.py \
              --artifacts "${ARTIFACTS_BY_JOB}" \
              --output-dir benchmark-results \
              --repo ${{ github.repository }} \
              --head-branch "${HEAD_BRANCH}" \
              --workflow-name "${{ github.workflow }}" \
              --workflow-run-id ${{ github.run_id }} \
              --workflow-run-attempt ${{ github.run_attempt }} \
              --benchmark-configs benchmark-configs
          done

          # Print every generated result file for debugging
          for SCHEMA in v2 v3; do
            for BENCHMARK_RESULTS in benchmark-results/"${SCHEMA}"/*.json; do
              cat "${BENCHMARK_RESULTS}"
              echo
            done
          done

      # TODO (huydhn): Remove v2 schema once the benchmark dashboard finishes the migration
      - name: Upload the benchmark results (v2)
        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
        with:
          benchmark-results-dir: benchmark-results/v2
          dry-run: false
          schema-version: v2

      - name: Upload the benchmark results (v3)
        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
        with:
          benchmark-results-dir: benchmark-results/v3
          dry-run: false
          schema-version: v3
          github-token: ${{ secrets.GITHUB_TOKEN }}