
Commit

try loading the kernels directly and provide a utility to install autoawq and autogptq from source
baptistecolle committed Aug 15, 2024
2 parents 0696ba4 + b58e904 commit 1353433
Showing 28 changed files with 407 additions and 118 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test_api_misc.yaml
@@ -49,4 +49,4 @@ jobs:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          PUSH_REPO_ID: optimum-benchmark/misc
        run: |
          pytest -s -k "api and not (cpu or cuda)"
          pytest -s -k "api and not (cpu or cuda or rocm or mps)"
48 changes: 48 additions & 0 deletions .github/workflows/test_cli_cpu_llama_cpp.yaml
@@ -0,0 +1,48 @@
name: CLI CPU LlamaCpp Tests

on:
  workflow_dispatch:
  push:
    branches:
      - main
    paths:
      - .github/workflows/test_cli_llama_cpp.yaml
      - "optimum_benchmark/**"
      - "docker/**"
      - "tests/**"
      - "setup.py"
  pull_request:
    branches:
      - main
    paths:
      - .github/workflows/test_cli_llama_cpp.yaml
      - "optimum_benchmark/**"
      - "docker/**"
      - "tests/**"
      - "setup.py"

concurrency:
  cancel-in-progress: true
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}

jobs:
  run_cli_cpu_llama_cpp_tests:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set up Python 3.10
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"

      - name: Install requirements
        run: |
          pip install --upgrade pip
          pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
          pip install -e .[testing,llama-cpp]
      - name: Run tests
        run: pytest -s -k "llama_cpp"
4 changes: 2 additions & 2 deletions .github/workflows/test_cli_misc.yaml
@@ -52,7 +52,7 @@ jobs:
        run: |
          pip install --upgrade pip
          pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
          pip install -e .[testing]
          pip install -e .[testing,timm,diffusers,codecarbon]
      - name: Run tests
        run: pytest -s -k "cli and not (cpu or cuda)"
        run: pytest -s -k "cli and not (cpu or cuda or rocm or mps)"
6 changes: 5 additions & 1 deletion .gitignore
@@ -173,5 +173,9 @@ experiments/
amdsmi/
amd-*


external_repos/
outputs/
outputs/

# Mac specific
.DS_Store
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -48,7 +48,7 @@ If you would like to work on any of the open Issues:
6. Depending on the feature you're working on and your development environment, you can run tests locally in an isolated docker container using the [makefile](Makefile). For example, to test the CLI with CPU device and PyTorch backend, you can run the following commands:

```bash
make install_cli_cpu_pytorch_extras
make install_cli_cpu_pytorch
make test_cli_cpu_pytorch
```
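
For the new LlamaCpp backend touched by this commit, a comparable local flow is sketched below; this is an assumption pieced together from the `llama-cpp` extra used in the new CPU workflow and the `test_cli_llama_cpp` target added to the Makefile, not an install target documented in CONTRIBUTING.md:

```bash
pip install -e .[testing,llama-cpp]
make test_cli_llama_cpp
```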

2 changes: 2 additions & 0 deletions Makefile
@@ -175,6 +175,8 @@ test_cli_rocm_pytorch_single_gpu:
pytest -s -k "cli and rocm and pytorch and not (dp or ddp or device_map or deepspeed) and not (bnb or awq)"

# llm-perf
test_cli_llama_cpp:
pytest -s -k "llama_cpp"

install_llm_perf_cuda_pytorch:
pip install packaging && pip install flash-attn einops scipy auto-gptq optimum bitsandbytes autoawq codecarbon
7 changes: 5 additions & 2 deletions README.md
@@ -13,6 +13,7 @@ Optimum-Benchmark is a unified [multi-backend & multi-device](#backends--devices

*News* 📰

- LlamaCpp backend for benchmarking [`llama-cpp-python`](https://github.com/abetlen/llama-cpp-python) bindings with all its supported devices 🚀
- 🥳 PyPI package is now available for installation: `pip install optimum-benchmark` 🎉 [check it out](https://pypi.org/project/optimum-benchmark/) !
- Model loading latency/memory/energy tracking for all backends in the inference scenario 🚀
- numactl support for Process and Torchrun launchers to control the NUMA nodes on which the benchmark runs.
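
As a quick illustration of the new LlamaCpp backend, the example configs added in this commit can be launched through the usual Hydra-style CLI; a minimal sketch, assuming the package is installed with the `llama-cpp` extra:

```bash
optimum-benchmark --config-dir examples/ --config-name llama_cpp_text_generation
```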
@@ -47,18 +48,20 @@ Optimum-Benchmark is continuously and intensively tested on a variety of devices

### CLI 📈

[![CLI_CPU_LLAMA_CPP](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_llama_cpp.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_llama_cpp.yaml)
[![CLI_CPU_NEURAL_COMPRESSOR](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_neural_compressor.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_neural_compressor.yaml)
[![CLI_CPU_ONNXRUNTIME](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_onnxruntime.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_onnxruntime.yaml)
[![CLI_CPU_OPENVINO](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_openvino.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_openvino.yaml)
[![CLI_CPU_PYTORCH](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_pytorch.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_pytorch.yaml)
[![CLI_CPU_PY_TXI](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_py_txi.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_py_txi.yaml)
[![CLI_CUDA_ONNXRUNTIME](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_onnxruntime.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_onnxruntime.yaml)
[![CLI_CUDA_VLLM](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_vllm.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_vllm.yaml)
[![CLI_CUDA_PYTORCH_MULTI_GPU](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_pytorch_multi_gpu.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_pytorch_multi_gpu.yaml)
[![CLI_CUDA_PYTORCH_SINGLE_GPU](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_pytorch_single_gpu.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_pytorch_single_gpu.yaml)
[![CLI_CUDA_TENSORRT_LLM](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_tensorrt_llm.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_tensorrt_llm.yaml)
[![CLI_CUDA_PY_TXI](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_py_txi.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_py_txi.yaml)
[![CLI_CUDA_TENSORRT_LLM_SINGLE_GPU](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_tensorrt_llm_single_gpu.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_tensorrt_llm_single_gpu.yaml)
[![CLI_CUDA_TORCH_ORT_MULTI_GPU](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_torch_ort_multi_gpu.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_torch_ort_multi_gpu.yaml)
[![CLI_CUDA_TORCH_ORT_SINGLE_GPU](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_torch_ort_single_gpu.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_torch_ort_single_gpu.yaml)
[![CLI_CUDA_VLLM_SINGLE_GPU](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_vllm_single_gpu.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_vllm_single_gpu.yaml)
[![CLI_MISC](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_misc.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_misc.yaml)
[![CLI_ROCM_PYTORCH_MULTI_GPU](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_rocm_pytorch_multi_gpu.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_rocm_pytorch_multi_gpu.yaml)
[![CLI_ROCM_PYTORCH_SINGLE_GPU](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_rocm_pytorch_single_gpu.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_rocm_pytorch_single_gpu.yaml)
26 changes: 26 additions & 0 deletions examples/llama_cpp_embedding.yaml
@@ -0,0 +1,26 @@
defaults:
  - benchmark
  - scenario: inference
  - launcher: inline
  - backend: llama_cpp
  - _base_
  - _self_

name: llama_cpp_llama

backend:
  device: mps
  model: nomic-ai/nomic-embed-text-v1.5-GGUF
  task: feature-extraction
  filename: nomic-embed-text-v1.5.Q4_0.gguf

scenario:
  input_shapes:
    batch_size: 1
    sequence_length: 256
    vocab_size: 30000
    type_vocab_size: 1
    max_position_embeddings: 512
  generate_kwargs:
    max_new_tokens: 100
    min_new_tokens: 100
25 changes: 25 additions & 0 deletions examples/llama_cpp_text_generation.yaml
@@ -0,0 +1,25 @@
defaults:
  - benchmark
  - scenario: inference
  - launcher: inline
  - backend: llama_cpp
  - _base_
  - _self_

name: llama_cpp_llama

backend:
  device: mps
  model: TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF
  task: text-generation
  filename: tinyllama-1.1b-chat-v1.0.Q4_0.gguf

scenario:
  input_shapes:
    batch_size: 1
    sequence_length: 256
    vocab_size: 32000
  generate_kwargs:
    max_new_tokens: 100
    min_new_tokens: 100
26 changes: 26 additions & 0 deletions examples/pytorch_bert_mps.yaml
@@ -0,0 +1,26 @@
defaults:
  - benchmark
  - scenario: inference
  - launcher: process # launcher: inline works too
  - backend: pytorch
  - _base_
  - _self_

name: pytorch_bert

# launcher:
#   start_method: spawn

scenario:
  latency: true
  memory: true
  input_shapes:
    batch_size: 1
    sequence_length: 128

backend:
  device: cpu
  no_weights: true
  model: bert-base-uncased
2 changes: 2 additions & 0 deletions optimum_benchmark/__init__.py
@@ -1,6 +1,7 @@
from .backends import (
BackendConfig,
INCConfig,
LlamaCppConfig,
LLMSwarmConfig,
ORTConfig,
OVConfig,
@@ -38,4 +39,5 @@
"TrainingConfig",
"TRTLLMConfig",
"VLLMConfig",
"LlamaCppConfig",
]
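
Since `LlamaCppConfig` is now re-exported at the package root, it can also be used from the Python API; a minimal sketch mirroring `examples/llama_cpp_text_generation.yaml` (the exact field names on `LlamaCppConfig` are assumed from that example, alongside the existing `Benchmark`, `BenchmarkConfig`, `InferenceConfig`, and `ProcessConfig` exports):

```python
from optimum_benchmark import Benchmark, BenchmarkConfig, InferenceConfig, LlamaCppConfig, ProcessConfig

# Backend settings mirrored from examples/llama_cpp_text_generation.yaml
backend_config = LlamaCppConfig(
    device="mps",
    model="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
    task="text-generation",
    filename="tinyllama-1.1b-chat-v1.0.Q4_0.gguf",
)

# Inference scenario: 1 x 256 inputs, generating exactly 100 new tokens
scenario_config = InferenceConfig(
    latency=True,
    input_shapes={"batch_size": 1, "sequence_length": 256},
    generate_kwargs={"max_new_tokens": 100, "min_new_tokens": 100},
)

# Run the benchmark in a separate process and collect the report
launcher_config = ProcessConfig()
benchmark_config = BenchmarkConfig(
    name="llama_cpp_tinyllama",
    backend=backend_config,
    scenario=scenario_config,
    launcher=launcher_config,
)
benchmark_report = Benchmark.launch(benchmark_config)
```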
2 changes: 2 additions & 0 deletions optimum_benchmark/backends/__init__.py
@@ -1,4 +1,5 @@
from .config import BackendConfig
from .llama_cpp.config import LlamaCppConfig
from .llm_swarm.config import LLMSwarmConfig
from .neural_compressor.config import INCConfig
from .onnxruntime.config import ORTConfig
@@ -20,4 +21,5 @@
"LLMSwarmConfig",
"BackendConfig",
"VLLMConfig",
"LlamaCppConfig",
]
13 changes: 12 additions & 1 deletion optimum_benchmark/backends/base.py
@@ -69,17 +69,28 @@ def __init__(self, config: BackendConfigT):
            self.pretrained_processor = None
            self.generation_config = None

        elif self.config.library == "llama_cpp":
            self.logger.info("\t+ Benchmarking a LlamaCpp model")
            self.pretrained_processor = None
            self.generation_config = None
            self.pretrained_config = None
            self.automodel_loader = None
            # TODO: need a custom method to extract shapes from gguf
            self.model_shapes = extract_transformers_shapes_from_artifacts(
                self.pretrained_config, self.pretrained_processor
            )

        else:
            self.logger.info("\t+ Benchmarking a Transformers model")
            self.generation_config = get_transformers_generation_config(self.config.model, **self.config.model_kwargs)
            self.pretrained_config = get_transformers_pretrained_config(self.config.model, **self.config.model_kwargs)
            self.automodel_loader = get_transformers_automodel_loader_for_task(self.config.task)
            self.pretrained_processor = get_transformers_pretrained_processor(
                self.config.processor, **self.config.processor_kwargs
            )
            self.model_shapes = extract_transformers_shapes_from_artifacts(
                self.pretrained_config, self.pretrained_processor
            )
            self.automodel_loader = get_transformers_automodel_loader_for_task(self.config.task)

    def seed(self) -> None:
        set_seed(self.config.seed)
28 changes: 21 additions & 7 deletions optimum_benchmark/backends/config.py
@@ -52,15 +52,27 @@ def __post_init__(self):
            self.processor = self.model

        # TODO: add cache_dir, token, etc. to these methods
        if self.task is None:
            self.task = infer_task_from_model_name_or_path(self.model, self.model_kwargs.get("revision", None))

        if self.library is None:
            self.library = infer_library_from_model_name_or_path(self.model, self.model_kwargs.get("revision", None))
            self.library = infer_library_from_model_name_or_path(
                self.model,
                revision=self.model_kwargs.get("revision", None),
                token=self.model_kwargs.get("token", None),
            )

        if self.task is None:
            self.task = infer_task_from_model_name_or_path(
                self.model,
                self.library,
                revision=self.model_kwargs.get("revision", None),
                token=self.model_kwargs.get("token", None),
            )

        if self.model_type is None:
            self.model_type = infer_model_type_from_model_name_or_path(
                self.model, self.model_kwargs.get("revision", None)
                self.model,
                self.library,
                revision=self.model_kwargs.get("revision", None),
                token=self.model_kwargs.get("token", None),
            )

        if self.device is None:
@@ -90,8 +102,10 @@ def __post_init__(self):
        else:
            raise RuntimeError("CUDA device is only supported on systems with NVIDIA or ROCm drivers.")

        if self.library not in ["transformers", "diffusers", "timm"]:
            raise ValueError(f"`library` must be either `transformers`, `diffusers` or `timm`, but got {self.library}")
        if self.library not in ["transformers", "diffusers", "timm", "llama_cpp"]:
            raise ValueError(
                f"`library` must be either `transformers`, `diffusers`, `timm` or `llama_cpp`, but got {self.library}"
            )

        if self.inter_op_num_threads is not None:
            if self.inter_op_num_threads == -1:
6 changes: 5 additions & 1 deletion optimum_benchmark/backends/diffusers_utils.py
@@ -1,3 +1,4 @@
import warnings
from typing import Dict

from hydra.utils import get_class
@@ -38,7 +39,9 @@


def get_diffusers_pretrained_config(model: str, **kwargs) -> Dict[str, int]:
    return DiffusionPipeline.load_config(model, **kwargs)
    config = DiffusionPipeline.load_config(model, **kwargs)
    pipeline_config = config[0] if isinstance(config, tuple) else config
    return pipeline_config


def extract_diffusers_shapes_from_model(model: str, **kwargs) -> Dict[str, int]:
@@ -62,6 +65,7 @@ def extract_diffusers_shapes_from_model(model: str, **kwargs) -> Dict[str, int]:
shapes["width"] = vae_config["sample_size"]

else:
warnings.warn("Could not extract shapes [num_channels, height, width] from diffusion pipeline.")
shapes["num_channels"] = -1
shapes["height"] = -1
shapes["width"] = -1