From c45aecda12da41e3b6f6ac9537d2f41a72709a5a Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Tue, 23 Jul 2024 10:39:31 +0200 Subject: [PATCH] Misc CI updates and multi-platform support (#233) --- .../build_and_publish_docker_images.yaml | 2 +- .../{check_quality.yaml => quality.yaml} | 16 +++---- .github/workflows/release.yaml | 11 +++-- .github/workflows/security.yml | 9 +++- .github/workflows/test_api_cpu.yaml | 4 +- .github/workflows/test_api_misc.yaml | 4 +- .github/workflows/test_api_rocm.yaml | 4 +- .../test_cli_cpu_neural_compressor.yaml | 4 +- .../workflows/test_cli_cpu_onnxruntime.yaml | 4 +- .github/workflows/test_cli_cpu_openvino.yaml | 4 +- .github/workflows/test_cli_cpu_py_txi.yaml | 4 +- .github/workflows/test_cli_cpu_pytorch.yaml | 4 +- .../workflows/test_cli_cuda_onnxruntime.yaml | 2 +- .github/workflows/test_cli_cuda_py_txi.yaml | 4 +- .../test_cli_cuda_pytorch_multi_gpu.yaml | 2 +- .../test_cli_cuda_pytorch_single_gpu.yaml | 2 +- ...est_cli_cuda_tensorrt_llm_single_gpu.yaml} | 6 +-- .../test_cli_cuda_torch_ort_multi_gpu.yaml | 4 +- .../test_cli_cuda_torch_ort_single_gpu.yaml | 4 +- .../test_cli_cuda_vllm_single_gpu.yaml | 4 +- .github/workflows/test_cli_misc.yaml | 11 ++--- .../test_cli_rocm_pytorch_multi_gpu.yaml | 2 +- .../test_cli_rocm_pytorch_single_gpu.yaml | 2 +- .../update_llm_perf_cuda_pytorch.yaml | 4 +- .../update_llm_perf_leaderboard.yaml | 2 +- .../generators/task_generator.py | 45 ++++++++++++++++++- .../launchers/device_isolation_utils.py | 10 ++++- tests/test_cli.py | 11 ++++- 28 files changed, 124 insertions(+), 61 deletions(-) rename .github/workflows/{check_quality.yaml => quality.yaml} (62%) rename .github/workflows/{test_cli_cuda_tensorrt_llm.yaml => test_cli_cuda_tensorrt_llm_single_gpu.yaml} (90%) diff --git a/.github/workflows/build_and_publish_docker_images.yaml b/.github/workflows/build_and_publish_docker_images.yaml index 57234691..20983712 100644 --- a/.github/workflows/build_and_publish_docker_images.yaml +++ b/.github/workflows/build_and_publish_docker_images.yaml @@ -20,7 +20,7 @@ env: IMAGE_NAME: ${{ github.repository }} jobs: - build_and_publish_docker_image: + publish: strategy: fail-fast: true matrix: diff --git a/.github/workflows/check_quality.yaml b/.github/workflows/quality.yaml similarity index 62% rename from .github/workflows/check_quality.yaml rename to .github/workflows/quality.yaml index 5b28d054..92ee85af 100644 --- a/.github/workflows/check_quality.yaml +++ b/.github/workflows/quality.yaml @@ -1,28 +1,22 @@ -name: Quality Check +name: Quality Checks on: - workflow_dispatch: push: - branches: - - main - pull_request: - branches: - - main concurrency: cancel-in-progress: true - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + group: ${{ github.workflow }}-${{ github.ref }} jobs: - run_quality_checks: + quality: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up Python 3.10 - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 with: python-version: "3.10" diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index de3596ca..6b377207 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -3,15 +3,20 @@ name: PYPI Release on: workflow_dispatch: +concurrency: + cancel-in-progress: true + group: ${{ github.workflow }}-${{ github.ref }} + jobs: - build_and_publish: + release: runs-on: ubuntu-latest + steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up Python 3.10 - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 with: python-version: "3.10" diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml index 3c2dc94d..28bd53ce 100644 --- a/.github/workflows/security.yml +++ b/.github/workflows/security.yml @@ -3,6 +3,10 @@ name: Security Checks on: push: +concurrency: + cancel-in-progress: true + group: ${{ github.workflow }}-${{ github.ref }} + permissions: contents: read @@ -10,7 +14,8 @@ jobs: secrets: runs-on: ubuntu-latest steps: - - shell: bash + - name: Set up environment variables + shell: bash run: | if [ "${{ github.event_name }}" == "push" ]; then echo "depth=$(($(jq length <<< '${{ toJson(github.event.commits) }}') + 2))" >> $GITHUB_ENV @@ -20,10 +25,12 @@ jobs: echo "depth=$((${{ github.event.pull_request.commits }}+2))" >> $GITHUB_ENV echo "branch=${{ github.event.pull_request.head.ref }}" >> $GITHUB_ENV fi + - name: Checkout code uses: actions/checkout@v4 with: ref: ${{env.branch}} fetch-depth: ${{env.depth}} + - name: Scan for secrets uses: trufflesecurity/trufflehog@main diff --git a/.github/workflows/test_api_cpu.yaml b/.github/workflows/test_api_cpu.yaml index eacf227f..a2b072b3 100644 --- a/.github/workflows/test_api_cpu.yaml +++ b/.github/workflows/test_api_cpu.yaml @@ -31,10 +31,10 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up Python 3.10 - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 with: python-version: "3.10" diff --git a/.github/workflows/test_api_misc.yaml b/.github/workflows/test_api_misc.yaml index 575e1910..4baaf0d7 100644 --- a/.github/workflows/test_api_misc.yaml +++ b/.github/workflows/test_api_misc.yaml @@ -31,10 +31,10 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up Python 3.10 - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 with: python-version: "3.10" diff --git a/.github/workflows/test_api_rocm.yaml b/.github/workflows/test_api_rocm.yaml index 0800745b..1e198da4 100644 --- a/.github/workflows/test_api_rocm.yaml +++ b/.github/workflows/test_api_rocm.yaml @@ -33,8 +33,8 @@ jobs: runs-on: [single-gpu, amd-gpu, mi250, ci] steps: - - name: Checkout code - uses: actions/checkout@v3 + - name: Checkout + uses: actions/checkout@v4 - name: Set target devices run: | diff --git a/.github/workflows/test_cli_cpu_neural_compressor.yaml b/.github/workflows/test_cli_cpu_neural_compressor.yaml index 8974fa4b..40b4e268 100644 --- a/.github/workflows/test_cli_cpu_neural_compressor.yaml +++ b/.github/workflows/test_cli_cpu_neural_compressor.yaml @@ -31,10 +31,10 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up Python 3.10 - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 with: python-version: "3.10" diff --git a/.github/workflows/test_cli_cpu_onnxruntime.yaml b/.github/workflows/test_cli_cpu_onnxruntime.yaml index ca79c90c..ca1e868e 100644 --- a/.github/workflows/test_cli_cpu_onnxruntime.yaml +++ b/.github/workflows/test_cli_cpu_onnxruntime.yaml @@ -31,10 +31,10 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up Python 3.10 - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 with: python-version: "3.10" diff --git a/.github/workflows/test_cli_cpu_openvino.yaml b/.github/workflows/test_cli_cpu_openvino.yaml index fe4d42eb..3dfc4635 100644 --- a/.github/workflows/test_cli_cpu_openvino.yaml +++ b/.github/workflows/test_cli_cpu_openvino.yaml @@ -31,10 +31,10 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up Python 3.10 - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 with: python-version: "3.10" diff --git a/.github/workflows/test_cli_cpu_py_txi.yaml b/.github/workflows/test_cli_cpu_py_txi.yaml index cbda5380..9366dc4d 100644 --- a/.github/workflows/test_cli_cpu_py_txi.yaml +++ b/.github/workflows/test_cli_cpu_py_txi.yaml @@ -31,10 +31,10 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up Python 3.10 - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 with: python-version: "3.10" diff --git a/.github/workflows/test_cli_cpu_pytorch.yaml b/.github/workflows/test_cli_cpu_pytorch.yaml index 0b190ce4..48c07fc6 100644 --- a/.github/workflows/test_cli_cpu_pytorch.yaml +++ b/.github/workflows/test_cli_cpu_pytorch.yaml @@ -31,10 +31,10 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up Python 3.10 - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 with: python-version: "3.10" diff --git a/.github/workflows/test_cli_cuda_onnxruntime.yaml b/.github/workflows/test_cli_cuda_onnxruntime.yaml index 3f1d363e..355296dc 100644 --- a/.github/workflows/test_cli_cuda_onnxruntime.yaml +++ b/.github/workflows/test_cli_cuda_onnxruntime.yaml @@ -34,7 +34,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Run tests uses: addnab/docker-run-action@v3 diff --git a/.github/workflows/test_cli_cuda_py_txi.yaml b/.github/workflows/test_cli_cuda_py_txi.yaml index d3ed57c0..5a85c583 100644 --- a/.github/workflows/test_cli_cuda_py_txi.yaml +++ b/.github/workflows/test_cli_cuda_py_txi.yaml @@ -31,10 +31,10 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up Python 3.10 - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 with: python-version: "3.10" diff --git a/.github/workflows/test_cli_cuda_pytorch_multi_gpu.yaml b/.github/workflows/test_cli_cuda_pytorch_multi_gpu.yaml index 885505c5..91555647 100644 --- a/.github/workflows/test_cli_cuda_pytorch_multi_gpu.yaml +++ b/.github/workflows/test_cli_cuda_pytorch_multi_gpu.yaml @@ -34,7 +34,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Run tests uses: addnab/docker-run-action@v3 diff --git a/.github/workflows/test_cli_cuda_pytorch_single_gpu.yaml b/.github/workflows/test_cli_cuda_pytorch_single_gpu.yaml index 0fc0af99..0a404bd0 100644 --- a/.github/workflows/test_cli_cuda_pytorch_single_gpu.yaml +++ b/.github/workflows/test_cli_cuda_pytorch_single_gpu.yaml @@ -34,7 +34,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Run tests uses: addnab/docker-run-action@v3 diff --git a/.github/workflows/test_cli_cuda_tensorrt_llm.yaml b/.github/workflows/test_cli_cuda_tensorrt_llm_single_gpu.yaml similarity index 90% rename from .github/workflows/test_cli_cuda_tensorrt_llm.yaml rename to .github/workflows/test_cli_cuda_tensorrt_llm_single_gpu.yaml index 73c73d9c..91e5f4ff 100644 --- a/.github/workflows/test_cli_cuda_tensorrt_llm.yaml +++ b/.github/workflows/test_cli_cuda_tensorrt_llm_single_gpu.yaml @@ -1,4 +1,4 @@ -name: CLI CUDA TensorRT-LLM Tests +name: CLI CUDA TensorRT-LLM Single-GPU Tests on: workflow_dispatch: @@ -29,12 +29,12 @@ env: IMAGE: huggingface/optimum-nvidia:latest jobs: - cli_cuda_tensorrt_llm_tests: + cli_cuda_tensorrt_llm_single_gpu_tests: runs-on: [single-gpu, nvidia-gpu, a10, ci] steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Run tests uses: addnab/docker-run-action@v3 diff --git a/.github/workflows/test_cli_cuda_torch_ort_multi_gpu.yaml b/.github/workflows/test_cli_cuda_torch_ort_multi_gpu.yaml index ddfba8b3..2fd43e4e 100644 --- a/.github/workflows/test_cli_cuda_torch_ort_multi_gpu.yaml +++ b/.github/workflows/test_cli_cuda_torch_ort_multi_gpu.yaml @@ -29,12 +29,12 @@ env: IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cuda-ort jobs: - build_image_and_run_cli_cuda_torch_ort_multi_gpu_tests: + run_cli_cuda_torch_ort_multi_gpu_tests: runs-on: [multi-gpu, nvidia-gpu, 4-a10, ci] steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Run tests uses: addnab/docker-run-action@v3 diff --git a/.github/workflows/test_cli_cuda_torch_ort_single_gpu.yaml b/.github/workflows/test_cli_cuda_torch_ort_single_gpu.yaml index 0a4e334c..db7b496a 100644 --- a/.github/workflows/test_cli_cuda_torch_ort_single_gpu.yaml +++ b/.github/workflows/test_cli_cuda_torch_ort_single_gpu.yaml @@ -29,12 +29,12 @@ env: IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cuda-ort jobs: - build_image_and_run_cli_cuda_torch_ort_single_gpu_tests: + run_cli_cuda_torch_ort_single_gpu_tests: runs-on: [single-gpu, nvidia-gpu, a10, ci] steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Run tests uses: addnab/docker-run-action@v3 diff --git a/.github/workflows/test_cli_cuda_vllm_single_gpu.yaml b/.github/workflows/test_cli_cuda_vllm_single_gpu.yaml index 66ce017a..48c60de7 100644 --- a/.github/workflows/test_cli_cuda_vllm_single_gpu.yaml +++ b/.github/workflows/test_cli_cuda_vllm_single_gpu.yaml @@ -1,4 +1,4 @@ -name: CLI CUDA vLLM Tests +name: CLI CUDA vLLM Single-GPU Tests on: workflow_dispatch: @@ -34,7 +34,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Run tests uses: addnab/docker-run-action@v3 diff --git a/.github/workflows/test_cli_misc.yaml b/.github/workflows/test_cli_misc.yaml index 93202a14..13f91b50 100644 --- a/.github/workflows/test_cli_misc.yaml +++ b/.github/workflows/test_cli_misc.yaml @@ -28,21 +28,22 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-latest] - python: ["3.8", "3.9", "3.10"] + os: [ubuntu-latest, "macos-latest", windows-latest] + python: ["3.8", "3.12"] runs-on: ${{ matrix.os }} steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python }} - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python }} - - name: Install packages + - name: Install Linux packages + if: matrix.os == 'ubuntu-latest' run: | sudo apt-get update sudo apt-get install -y numactl diff --git a/.github/workflows/test_cli_rocm_pytorch_multi_gpu.yaml b/.github/workflows/test_cli_rocm_pytorch_multi_gpu.yaml index aee4853e..acf519c8 100644 --- a/.github/workflows/test_cli_rocm_pytorch_multi_gpu.yaml +++ b/.github/workflows/test_cli_rocm_pytorch_multi_gpu.yaml @@ -34,7 +34,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set target devices run: | diff --git a/.github/workflows/test_cli_rocm_pytorch_single_gpu.yaml b/.github/workflows/test_cli_rocm_pytorch_single_gpu.yaml index 10674b68..ca6096b2 100644 --- a/.github/workflows/test_cli_rocm_pytorch_single_gpu.yaml +++ b/.github/workflows/test_cli_rocm_pytorch_single_gpu.yaml @@ -34,7 +34,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set target devices run: | diff --git a/.github/workflows/update_llm_perf_cuda_pytorch.yaml b/.github/workflows/update_llm_perf_cuda_pytorch.yaml index 254ad8d0..495fea20 100644 --- a/.github/workflows/update_llm_perf_cuda_pytorch.yaml +++ b/.github/workflows/update_llm_perf_cuda_pytorch.yaml @@ -13,7 +13,7 @@ env: IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cuda jobs: - build_image_and_run_benchmarks: + run_benchmarks: strategy: fail-fast: false matrix: @@ -24,7 +24,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Run benchmarks uses: addnab/docker-run-action@v3 diff --git a/.github/workflows/update_llm_perf_leaderboard.yaml b/.github/workflows/update_llm_perf_leaderboard.yaml index f1e68454..10ed80c9 100644 --- a/.github/workflows/update_llm_perf_leaderboard.yaml +++ b/.github/workflows/update_llm_perf_leaderboard.yaml @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up Python 3.10 uses: actions/setup-python@v3 diff --git a/optimum_benchmark/generators/task_generator.py b/optimum_benchmark/generators/task_generator.py index 78e2a754..a2443073 100644 --- a/optimum_benchmark/generators/task_generator.py +++ b/optimum_benchmark/generators/task_generator.py @@ -1,11 +1,14 @@ +import logging import random import string from abc import ABC from typing import Tuple -# TODO: drop torch dependency and use numpy instead ? +# TODO: drop torch dependency and use numpy instead import torch +LOGGER = logging.getLogger("generators") + class TaskGenerator(ABC): def __init__(self, shapes, with_labels: bool): @@ -14,18 +17,58 @@ def __init__(self, shapes, with_labels: bool): @staticmethod def generate_random_integers(min_value: int, max_value: int, shape: Tuple[int]): + if min_value is None: + LOGGER.warning("min_value is None, setting it to 0") + min_value = 0 + + if max_value is None: + LOGGER.warning("max_value is None, setting it to 2") + max_value = 2 + + if None in shape: + LOGGER.warning("shape contains None, setting it to (1, 1)") + shape = (1, 1) + return torch.randint(min_value, max_value, shape) @staticmethod def generate_random_floats(min_value: float, max_value: float, shape: Tuple[int]): + if min_value is None: + LOGGER.warning("min_value is None, setting it to 0") + min_value = 0 + + if max_value is None: + LOGGER.warning("max_value is None, setting it to 1") + max_value = 1 + + if None in shape: + LOGGER.warning("shape contains None, setting it to (1, 1)") + shape = (1, 1) + return torch.rand(shape) * (max_value - min_value) + min_value @staticmethod def generate_ranges(start: int, stop: int, shape: Tuple[int]): + if start is None: + LOGGER.warning("start is None, setting it to 0") + start = 0 + + if stop is None: + LOGGER.warning("stop is None, setting it to 1") + stop = 1 + + if None in shape: + LOGGER.warning("shape contains None, setting it to (1, 1)") + shape = (1, 1) + return torch.arange(start, stop).repeat(shape[0], 1) @staticmethod def generate_random_strings(shape: Tuple[int]): + if None in shape: + LOGGER.warning("shape contains None, setting it to (1, 1)") + shape = (1, 1) + return [ "".join(random.choice(string.ascii_letters + string.digits) for _ in range(shape[1])) for _ in range(shape[0]) diff --git a/optimum_benchmark/launchers/device_isolation_utils.py b/optimum_benchmark/launchers/device_isolation_utils.py index 9264def5..11d0ec7e 100644 --- a/optimum_benchmark/launchers/device_isolation_utils.py +++ b/optimum_benchmark/launchers/device_isolation_utils.py @@ -1,5 +1,6 @@ import os import signal +import sys import time from logging import getLogger from typing import Set @@ -29,7 +30,8 @@ def isolation_error_signal_handler(signum, frame): raise DeviceIsolationError("Received an error signal from the device isolation process") -signal.signal(signal.SIGUSR1, isolation_error_signal_handler) +if sys.platform == "linux": + signal.signal(signal.SIGUSR1, isolation_error_signal_handler) def get_nvidia_devices_pids(device_ids: str) -> Set[int]: @@ -157,7 +159,11 @@ def assert_device_isolation(pid: int, device_ids: str, action: str): LOGGER.warn("Make sure no other process is running on the device(s) while benchmarking.") elif action == "error": LOGGER.error("Signaling the isolated process to error out.") - os.kill(pid, signal.SIGUSR1) + if sys.platform == "linux": + os.kill(pid, signal.SIGUSR1) + else: + LOGGER.error("Sending an error signal is only supported on Linux. Killing the isolated process.") + os.kill(pid, signal.SIGKILL) elif action == "kill": LOGGER.error("Killing the isolated process.") os.kill(pid, signal.SIGKILL) diff --git a/tests/test_cli.py b/tests/test_cli.py index 806eedfa..e1bf67fb 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,5 +1,7 @@ import os +import sys from logging import getLogger +from pathlib import Path import pytest @@ -7,9 +9,8 @@ LOGGER = getLogger("test") - FORCE_SERIAL = os.environ.get("FORCE_SERIAL", "0") == "1" -TEST_CONFIG_DIR = "/".join(__file__.split("/")[:-1] + ["configs"]) +TEST_CONFIG_DIR = Path(__file__).parent / "configs" TEST_CONFIG_NAMES = [ config.split(".")[0] for config in os.listdir(TEST_CONFIG_DIR) @@ -62,6 +63,9 @@ def test_cli_exit_code_0(launcher): @pytest.mark.parametrize("launcher", ["inline", "process", "torchrun"]) def test_cli_exit_code_1(launcher): + if launcher == "torchrun" and sys.platform != "linux": + pytest.skip("torchrun is only supported on Linux") + args_1 = [ "optimum-benchmark", "--config-dir", @@ -81,6 +85,9 @@ def test_cli_exit_code_1(launcher): def test_cli_numactl(): + if sys.platform != "linux": + pytest.skip("numactl is only supported on Linux") + args = [ "optimum-benchmark", "--config-dir",