Merge branch 'main' into llama_cpp
baptistecolle committed Jul 22, 2024
2 parents 9441985 + 9337b86 commit b2e14aa
Showing 43 changed files with 1,155 additions and 878 deletions.
@@ -50,4 +50,4 @@ jobs:
run: |
pip install packaging
pip install -e .[testing,vllm,flash-attn]
pytest -x -s -k "cli and cuda and vllm"
FORCE_SERIAL=1 pytest -x -s -k "cli and cuda and vllm"
3 changes: 2 additions & 1 deletion README.md
@@ -14,7 +14,8 @@ Optimum-Benchmark is a unified [multi-backend & multi-device](#backends--devices
*News* 📰

- 🥳 PyPI package is now available for installation: `pip install optimum-benchmark` 🎉 [check it out](https://pypi.org/project/optimum-benchmark/) !
- numactl support for Process and Torchrun launchers to control the NUMA nodes on which the benchmark runs 🧠
- Model loading latency/memory/energy tracking for all backends in the inference scenario 🚀
- numactl support for Process and Torchrun launchers to control the NUMA nodes on which the benchmark runs.
- 4 minimal docker images (`cpu`, `cuda`, `rocm`, `cuda-ort`) in [packages](https://github.com/huggingface/optimum-benchmark/pkgs/container/optimum-benchmark) for testing, benchmarking and reproducibility 🐳
- vLLM backend for benchmarking [vLLM](https://github.com/vllm-project/vllm)'s inference engine 🚀
- Hosting the codebase of the [LLM-Perf Leaderboard](https://huggingface.co/spaces/optimum/llm-perf-leaderboard) 🥇
25 changes: 25 additions & 0 deletions examples/llama_cpp_mps.yaml
@@ -0,0 +1,25 @@
defaults:
- benchmark
- scenario: inference
- launcher: inline
- backend: llama_cpp
- _base_
- _self_

name: llama_cpp_llama

backend:
device: mps
model: TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF
task: text-generation
filename: tinyllama-1.1b-chat-v1.0.Q4_0.gguf


scenario:
input_shapes:
batch_size: 1
sequence_length: 256
vocab_size: 32000
generate_kwargs:
max_new_tokens: 100
min_new_tokens: 100
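
A programmatic equivalent of this config could look like the sketch below. The import paths for LlamaCppConfig and InlineConfig are assumptions (the llama.cpp backend and inline launcher configs may be exposed elsewhere); the values simply mirror the YAML above.

# Minimal sketch of examples/llama_cpp_mps.yaml using the Python API.
# LlamaCppConfig / InlineConfig import paths are assumed, not confirmed.
from optimum_benchmark import Benchmark, BenchmarkConfig, InferenceConfig
from optimum_benchmark.backends.llama_cpp.config import LlamaCppConfig  # assumed path
from optimum_benchmark.launchers.inline.config import InlineConfig  # assumed path

backend_config = LlamaCppConfig(
    device="mps",
    model="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
    task="text-generation",
    filename="tinyllama-1.1b-chat-v1.0.Q4_0.gguf",
)
scenario_config = InferenceConfig(
    input_shapes={"batch_size": 1, "sequence_length": 256, "vocab_size": 32000},
    generate_kwargs={"max_new_tokens": 100, "min_new_tokens": 100},
)
benchmark_config = BenchmarkConfig(
    name="llama_cpp_llama", launcher=InlineConfig(), scenario=scenario_config, backend=backend_config
)
report = Benchmark.launch(benchmark_config)
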
23 changes: 23 additions & 0 deletions examples/llama_mps.yaml
@@ -0,0 +1,23 @@
defaults:
- benchmark
- scenario: inference
- launcher: inline
- backend: pytorch
- _base_
- _self_

name: llama_tiny_mps

backend:
device: mps
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
task: text-generation

scenario:
input_shapes:
batch_size: 4
sequence_length: 256
vocab_size: 32000
generate_kwargs:
max_new_tokens: 100
min_new_tokens: 100
63 changes: 63 additions & 0 deletions examples/pytorch_llama.py
@@ -0,0 +1,63 @@
import os

from optimum_benchmark import Benchmark, BenchmarkConfig, InferenceConfig, ProcessConfig, PyTorchConfig
from optimum_benchmark.logging_utils import setup_logging

BENCHMARK_NAME = "pytorch-llama"

WEIGHTS_CONFIGS = {
"float16": {
"torch_dtype": "float16",
"quantization_scheme": None,
"quantization_config": {},
},
# "4bit-awq-gemm": {
# "torch_dtype": "float16",
# "quantization_scheme": "awq",
# "quantization_config": {"bits": 4, "version": "gemm"},
# },
# "4bit-gptq-exllama-v2": {
# "torch_dtype": "float16",
# "quantization_scheme": "gptq",
# "quantization_config": {"bits": 4, "use_exllama ": True, "version": 2, "model_seqlen": 256},
# },
}


def run_benchmark(weight_config: str):
launcher_config = ProcessConfig(device_isolation=True, device_isolation_action="warn")
backend_config = PyTorchConfig(
device="cuda",
device_ids="0",
no_weights=True,
model="gpt2",
**WEIGHTS_CONFIGS[weight_config],
)
scenario_config = InferenceConfig(
memory=True,
latency=True,
duration=10,
iterations=10,
warmup_runs=10,
input_shapes={"batch_size": 1, "sequence_length": 128},
generate_kwargs={"max_new_tokens": 32, "min_new_tokens": 32},
)

benchmark_config = BenchmarkConfig(
name=BENCHMARK_NAME, launcher=launcher_config, scenario=scenario_config, backend=backend_config
)
benchmark_report = Benchmark.launch(benchmark_config)
benchmark = Benchmark(config=benchmark_config, report=benchmark_report)

filename = f"{BENCHMARK_NAME}-{backend_config.version}-{weight_config}.json"
benchmark.push_to_hub(repo_id="optimum-benchmark/pytorch-llama", filename=filename)
benchmark.save_json(path=f"benchmarks/{filename}")


if __name__ == "__main__":
level = os.environ.get("LOG_LEVEL", "INFO")
to_file = os.environ.get("LOG_TO_FILE", "0") == "1"
setup_logging(level=level, to_file=to_file, prefix="MAIN-PROCESS")

for weight_config in WEIGHTS_CONFIGS:
run_benchmark(weight_config)
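
To benchmark an additional precision with this script, one could extend WEIGHTS_CONFIGS before the loop; the entry below is a hypothetical example that mirrors the keys of the existing entries.

# Hypothetical extra entry for WEIGHTS_CONFIGS (same keys as the entries above):
WEIGHTS_CONFIGS["bfloat16"] = {
    "torch_dtype": "bfloat16",
    "quantization_scheme": None,
    "quantization_config": {},
}
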
23 changes: 15 additions & 8 deletions examples/pytorch_llama_awq.yaml → examples/pytorch_llama.yaml
@@ -3,24 +3,31 @@ defaults:
- scenario: inference
- launcher: process
- backend: pytorch
- _base_
- _self_

experiment_name: pytorch_llama_awq
name: pytorch_llama

launcher:
device_isolation: true
device_isolation_action: warn

backend:
model: gpt2
device: cuda
device_ids: 0
no_weights: true
model: TheBloke/Llama-2-70B-AWQ
torch_dtype: float16

scenario:
memory: true
latency: true

warmup_runs: 10
iterations: 10
duration: 10

benchmark:
input_shapes:
batch_size: 1
sequence_length: 128
sequence_length: 256
generate_kwargs:
max_new_tokens: 100
min_new_tokens: 100
max_new_tokens: 32
min_new_tokens: 32
28 changes: 0 additions & 28 deletions examples/pytorch_llama_awq.py

This file was deleted.

7 changes: 5 additions & 2 deletions examples/vllm_llama.yaml
@@ -14,9 +14,12 @@ launcher:

backend:
device: cuda
device_ids: 2
no_weights: true
device_ids: 0
no_weights: false
serving_mode: offline
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
engine_args:
enforce_eager: true

scenario:
input_shapes:
65 changes: 43 additions & 22 deletions optimum_benchmark/backends/base.py
@@ -1,24 +1,34 @@
import os
from abc import ABC
from collections import OrderedDict
from logging import getLogger
from typing import Any, ClassVar, Dict, Generic, Optional, Tuple
from typing import Any, ClassVar, Dict, Generic, Optional

import datasets.utils.logging as datasets_logging
import transformers.utils.logging as transformers_logging
from safetensors.torch import save_file
from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel, TrainerState, set_seed

from ..task_utils import get_automodel_class_for_task
from ..import_utils import is_torch_available
from .config import BackendConfigT
from .diffusers_utils import extract_diffusers_shapes_from_model, get_diffusers_pretrained_config
from .timm_utils import extract_timm_shapes_from_config, get_timm_pretrained_config
from .diffusers_utils import (
extract_diffusers_shapes_from_model,
get_diffusers_automodel_loader_for_task,
get_diffusers_pretrained_config,
)
from .timm_utils import extract_timm_shapes_from_config, get_timm_automodel_loader, get_timm_pretrained_config
from .transformers_utils import (
PretrainedProcessor,
extract_transformers_shapes_from_artifacts,
get_transformers_automodel_loader_for_task,
get_transformers_generation_config,
get_transformers_pretrained_config,
get_transformers_pretrained_processor,
)

if is_torch_available():
import torch

datasets_logging.set_verbosity_error()
transformers_logging.set_verbosity_error()

@@ -47,15 +57,15 @@ def __init__(self, config: BackendConfigT):
self.logger.info("\t+ Benchmarking a Diffusers pipeline")
self.pretrained_config = get_diffusers_pretrained_config(self.config.model, **self.config.model_kwargs)
self.model_shapes = extract_diffusers_shapes_from_model(self.config.model, **self.config.model_kwargs)
self.model_type = self.config.task
self.automodel_loader = get_diffusers_automodel_loader_for_task(self.config.task)
self.pretrained_processor = None
self.generation_config = None

elif self.config.library == "timm":
self.logger.info("\t+ Benchmarking a Timm model")
self.pretrained_config = get_timm_pretrained_config(self.config.model)
self.model_shapes = extract_timm_shapes_from_config(self.pretrained_config)
self.model_type = self.pretrained_config.architecture
self.automodel_loader = get_timm_automodel_loader()
self.pretrained_processor = None
self.generation_config = None
elif self.config.library == "llama_cpp":
@@ -75,31 +85,42 @@ def __init__(self, config: BackendConfigT):
self.model_shapes = extract_transformers_shapes_from_artifacts(
self.pretrained_config, self.pretrained_processor
)
self.model_type = self.pretrained_config.model_type

self.automodel_class = get_automodel_class_for_task(
model_type=self.model_type, library=self.config.library, task=self.config.task, framework="pt"
)
self.logger.info(f"\t+ Using automodel class {self.automodel_class.__name__}")
self.automodel_loader = get_transformers_automodel_loader_for_task(self.config.task)

def seed(self) -> None:
set_seed(self.config.seed)

def prepare_for_inference(self, **kwargs) -> None:
def create_no_weights_model(self) -> None:
if self.pretrained_config is None:
raise ValueError("Can't create no weights model without a pretrained config")

self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model")
self.logger.info("\t+ Creating no weights model's directory")
os.makedirs(self.no_weights_model, exist_ok=True)
self.logger.info("\t+ Creating no weights model's state dict")
state_dict = torch.nn.Linear(1, 1).state_dict()
self.logger.info("\t+ Saving no weights model's safetensors")
safetensors = os.path.join(self.no_weights_model, "model.safetensors")
save_file(tensors=state_dict, filename=safetensors, metadata={"format": "pt"})
self.logger.info("\t+ Saving no weights model's config")
self.pretrained_config.save_pretrained(save_directory=self.no_weights_model)

def prepare_input_shapes(self, input_shapes: Dict[str, Any]) -> Dict[str, Any]:
"""
This method is used to prepare the model for inference.
It can be used to compile the model with certain input/output shapes, for example.
This method is used to prepare and register the input shapes before they are used by the model.
It can be used to pad the inputs to the correct shape, or compile them into the correct format.
"""
pass
return input_shapes

def prepare_inputs(
self, inputs: Dict[str, Any], input_shapes: Dict[str, Any]
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
"""
This method is used to prepare the inputs before passing them to the model.
It can be used to move the inputs to the correct device, for example.
This method is used to prepare and register the inputs before passing them to the model.
It can be used to move the inputs to the correct device, or rename their keys.
"""
return inputs, input_shapes
return inputs

def load(self) -> None:
raise NotImplementedError("Backend must implement load method")

def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict:
"""
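
The updated hooks change the backend contract: prepare_input_shapes now returns the (possibly adjusted) shapes, and prepare_inputs returns only the inputs. Below is a minimal sketch of a subclass following the new signatures; DummyBackend and its body are illustrative, not part of the codebase.

# Sketch only: a hypothetical backend implementing the new hook signatures.
from typing import Any, Dict

from optimum_benchmark.backends.base import Backend  # path per this diff


class DummyBackend(Backend):
    NAME = "dummy"

    def load(self) -> None:
        # the base class resolves self.automodel_loader from the task/library
        self.pretrained_model = self.automodel_loader.from_pretrained(self.config.model)

    def prepare_input_shapes(self, input_shapes: Dict[str, Any]) -> Dict[str, Any]:
        # adjust/register the shapes before inputs are generated (no-op here)
        return input_shapes

    def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        # e.g. move tensor inputs to the configured device
        return {name: value.to(self.config.device) for name, value in inputs.items()}
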
32 changes: 16 additions & 16 deletions optimum_benchmark/backends/config.py
@@ -7,7 +7,11 @@
from psutil import cpu_count

from ..system_utils import get_gpu_device_ids, is_nvidia_system, is_rocm_system
from ..task_utils import infer_library_from_model_name_or_path, infer_task_from_model_name_or_path
from ..task_utils import (
infer_library_from_model_name_or_path,
infer_model_type_from_model_name_or_path,
infer_task_from_model_name_or_path,
)

LOGGER = getLogger("backend")

@@ -20,15 +24,16 @@ class BackendConfig(ABC):

task: Optional[str] = None
library: Optional[str] = None
model_type: Optional[str] = None

model: Optional[str] = None
processor: Optional[str] = None

device: Optional[str] = None
device_ids: Optional[str] = None
# yes we use a string here instead of a list
# we use a string here instead of a list
# because it's easier to pass in a yaml or from cli
# and it's consistent with GPU environment variables
device_ids: Optional[str] = None

seed: int = 42
inter_op_num_threads: Optional[int] = None
@@ -39,29 +44,24 @@ class BackendConfig(ABC):
# processor kwargs that are added to its init method/constructor
processor_kwargs: Dict[str, Any] = field(default_factory=dict)

# deprecated
hub_kwargs: Dict[str, Any] = field(default_factory=dict)

def __post_init__(self):
if self.model is None:
raise ValueError("`model` must be specified.")

if self.processor is None:
self.processor = self.model

if self.hub_kwargs:
LOGGER.warning(
"`hub_kwargs` is deprecated and will be removed in future versions."
"Please use `model_kwargs` and `processor_kwargs` seperately."
)
self.model_kwargs = {**self.model_kwargs, **self.hub_kwargs}
self.processor_kwargs = {**self.processor_kwargs, **self.hub_kwargs}

# TODO: add cache_dir, token, etc. to these methods
if self.task is None:
self.task = infer_task_from_model_name_or_path(self.model, self.hub_kwargs.get("revision", None))
self.task = infer_task_from_model_name_or_path(self.model, self.model_kwargs.get("revision", None))

if self.library is None:
self.library = infer_library_from_model_name_or_path(self.model, self.hub_kwargs.get("revision", None))
self.library = infer_library_from_model_name_or_path(self.model, self.model_kwargs.get("revision", None))

if self.model_type is None:
self.model_type = infer_model_type_from_model_name_or_path(
self.model, self.model_kwargs.get("revision", None)
)

if self.device is None:
self.device = "cuda" if is_nvidia_system() or is_rocm_system() else "cpu"
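
With model_type now inferred alongside task and library, a concrete backend config only needs the model id in the simplest case. The snippet below is a sketch: PyTorchConfig is the config already used in examples/pytorch_llama.py, the Hub is queried at init time, and the printed values are examples.

from optimum_benchmark import PyTorchConfig

config = PyTorchConfig(model="gpt2", device="cpu")
# __post_init__ fills in the unspecified fields from the Hub metadata:
print(config.task, config.library, config.model_type)  # e.g. text-generation transformers gpt2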