Merge branch 'main' into llama_cpp
baptistecolle committed Jul 22, 2024
2 parents 9441985 + 9337b86 commit b2e14aa
Showing 43 changed files with 1,155 additions and 878 deletions.
@@ -50,4 +50,4 @@ jobs:
run: |
pip install packaging
pip install -e .[testing,vllm,flash-attn]
pytest -x -s -k "cli and cuda and vllm"
FORCE_SERIAL=1 pytest -x -s -k "cli and cuda and vllm"
3 changes: 2 additions & 1 deletion README.md
@@ -14,7 +14,8 @@ Optimum-Benchmark is a unified [multi-backend & multi-device](#backends--devices
*News* 📰

- 🥳 PyPI package is now available for installation: `pip install optimum-benchmark` 🎉 [check it out](https://pypi.org/project/optimum-benchmark/) !
- numactl support for Process and Torchrun launchers to control the NUMA nodes on which the benchmark runs 🧠
- Model loading latency/memory/energy tracking for all backends in the inference scenario 🚀
- numactl support for Process and Torchrun launchers to control the NUMA nodes on which the benchmark runs.
- 4 minimal docker images (`cpu`, `cuda`, `rocm`, `cuda-ort`) in [packages](https://github.com/huggingface/optimum-benchmark/pkgs/container/optimum-benchmark) for testing, benchmarking and reproducibility 🐳
- vLLM backend for benchmarking [vLLM](https://github.com/vllm-project/vllm)'s inference engine 🚀
- Hosting the codebase of the [LLM-Perf Leaderboard](https://huggingface.co/spaces/optimum/llm-perf-leaderboard) 🥇
25 changes: 25 additions & 0 deletions examples/llama_cpp_mps.yaml
@@ -0,0 +1,25 @@
defaults:
- benchmark
- scenario: inference
- launcher: inline
- backend: llama_cpp
- _base_
- _self_

name: llama_cpp_llama

backend:
device: mps
model: TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF
task: text-generation
filename: tinyllama-1.1b-chat-v1.0.Q4_0.gguf


scenario:
input_shapes:
batch_size: 1
sequence_length: 256
vocab_size: 32000
generate_kwargs:
max_new_tokens: 100
min_new_tokens: 100
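
A programmatic equivalent of this config could look like the sketch below. The import paths for LlamaCppConfig and InlineConfig are assumptions (the llama.cpp backend and inline launcher configs may be exposed elsewhere); the values simply mirror the YAML above.

# Minimal sketch of examples/llama_cpp_mps.yaml using the Python API.
# LlamaCppConfig / InlineConfig import paths are assumed, not confirmed.
from optimum_benchmark import Benchmark, BenchmarkConfig, InferenceConfig
from optimum_benchmark.backends.llama_cpp.config import LlamaCppConfig  # assumed path
from optimum_benchmark.launchers.inline.config import InlineConfig  # assumed path

backend_config = LlamaCppConfig(
    device="mps",
    model="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
    task="text-generation",
    filename="tinyllama-1.1b-chat-v1.0.Q4_0.gguf",
)
scenario_config = InferenceConfig(
    input_shapes={"batch_size": 1, "sequence_length": 256, "vocab_size": 32000},
    generate_kwargs={"max_new_tokens": 100, "min_new_tokens": 100},
)
benchmark_config = BenchmarkConfig(
    name="llama_cpp_llama", launcher=InlineConfig(), scenario=scenario_config, backend=backend_config
)
report = Benchmark.launch(benchmark_config)
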
23 changes: 23 additions & 0 deletions examples/llama_mps.yaml
@@ -0,0 +1,23 @@
defaults:
- benchmark
- scenario: inference
- launcher: inline
- backend: pytorch
- _base_
- _self_

name: llama_tiny_mps

backend:
device: mps
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
task: text-generation

scenario:
input_shapes:
batch_size: 4
sequence_length: 256
vocab_size: 32000
generate_kwargs:
max_new_tokens: 100
min_new_tokens: 100
63 changes: 63 additions & 0 deletions examples/pytorch_llama.py
@@ -0,0 +1,63 @@
import os

from optimum_benchmark import Benchmark, BenchmarkConfig, InferenceConfig, ProcessConfig, PyTorchConfig
from optimum_benchmark.logging_utils import setup_logging

BENCHMARK_NAME = "pytorch-llama"

WEIGHTS_CONFIGS = {
"float16": {
"torch_dtype": "float16",
"quantization_scheme": None,
"quantization_config": {},
},
# "4bit-awq-gemm": {
# "torch_dtype": "float16",
# "quantization_scheme": "awq",
# "quantization_config": {"bits": 4, "version": "gemm"},
# },
# "4bit-gptq-exllama-v2": {
# "torch_dtype": "float16",
# "quantization_scheme": "gptq",
# "quantization_config": {"bits": 4, "use_exllama ": True, "version": 2, "model_seqlen": 256},
# },
}


def run_benchmark(weight_config: str):
launcher_config = ProcessConfig(device_isolation=True, device_isolation_action="warn")
backend_config = PyTorchConfig(
device="cuda",
device_ids="0",
no_weights=True,
model="gpt2",
**WEIGHTS_CONFIGS[weight_config],
)
scenario_config = InferenceConfig(
memory=True,
latency=True,
duration=10,
iterations=10,
warmup_runs=10,
input_shapes={"batch_size": 1, "sequence_length": 128},
generate_kwargs={"max_new_tokens": 32, "min_new_tokens": 32},
)

benchmark_config = BenchmarkConfig(
name=BENCHMARK_NAME, launcher=launcher_config, scenario=scenario_config, backend=backend_config
)
benchmark_report = Benchmark.launch(benchmark_config)
benchmark = Benchmark(config=benchmark_config, report=benchmark_report)

filename = f"{BENCHMARK_NAME}-{backend_config.version}-{weight_config}.json"
benchmark.push_to_hub(repo_id="optimum-benchmark/pytorch-llama", filename=filename)
benchmark.save_json(path=f"benchmarks/{filename}")


if __name__ == "__main__":
level = os.environ.get("LOG_LEVEL", "INFO")
to_file = os.environ.get("LOG_TO_FILE", "0") == "1"
setup_logging(level=level, to_file=to_file, prefix="MAIN-PROCESS")

for weight_config in WEIGHTS_CONFIGS:
run_benchmark(weight_config)
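
To benchmark an additional precision with this script, one could extend WEIGHTS_CONFIGS before the loop; the entry below is a hypothetical example that mirrors the keys of the existing entries.

# Hypothetical extra entry for WEIGHTS_CONFIGS (same keys as the entries above):
WEIGHTS_CONFIGS["bfloat16"] = {
    "torch_dtype": "bfloat16",
    "quantization_scheme": None,
    "quantization_config": {},
}
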
23 changes: 15 additions & 8 deletions examples/pytorch_llama_awq.yaml → examples/pytorch_llama.yaml
@@ -3,24 +3,31 @@ defaults:
- scenario: inference
- launcher: process
- backend: pytorch
- _base_
- _self_

experiment_name: pytorch_llama_awq
name: pytorch_llama

launcher:
device_isolation: true
device_isolation_action: warn

backend:
model: gpt2
device: cuda
device_ids: 0
no_weights: true
model: TheBloke/Llama-2-70B-AWQ
torch_dtype: float16

scenario:
memory: true
latency: true

warmup_runs: 10
iterations: 10
duration: 10

benchmark:
input_shapes:
batch_size: 1
sequence_length: 128
sequence_length: 256
generate_kwargs:
max_new_tokens: 100
min_new_tokens: 100
max_new_tokens: 32
min_new_tokens: 32
28 changes: 0 additions & 28 deletions examples/pytorch_llama_awq.py

This file was deleted.

7 changes: 5 additions & 2 deletions examples/vllm_llama.yaml
@@ -14,9 +14,12 @@ launcher:

backend:
device: cuda
device_ids: 2
no_weights: true
device_ids: 0
no_weights: false
serving_mode: offline
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
engine_args:
enforce_eager: true

scenario:
input_shapes:
65 changes: 43 additions & 22 deletions optimum_benchmark/backends/base.py
@@ -1,24 +1,34 @@
import os
from abc import ABC
from collections import OrderedDict
from logging import getLogger
from typing import Any, ClassVar, Dict, Generic, Optional, Tuple
from typing import Any, ClassVar, Dict, Generic, Optional

import datasets.utils.logging as datasets_logging
import transformers.utils.logging as transformers_logging
from safetensors.torch import save_file
from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel, TrainerState, set_seed

from ..task_utils import get_automodel_class_for_task
from ..import_utils import is_torch_available
from .config import BackendConfigT
from .diffusers_utils import extract_diffusers_shapes_from_model, get_diffusers_pretrained_config
from .timm_utils import extract_timm_shapes_from_config, get_timm_pretrained_config
from .diffusers_utils import (
extract_diffusers_shapes_from_model,
get_diffusers_automodel_loader_for_task,
get_diffusers_pretrained_config,
)
from .timm_utils import extract_timm_shapes_from_config, get_timm_automodel_loader, get_timm_pretrained_config
from .transformers_utils import (
PretrainedProcessor,
extract_transformers_shapes_from_artifacts,
get_transformers_automodel_loader_for_task,
get_transformers_generation_config,
get_transformers_pretrained_config,
get_transformers_pretrained_processor,
)

if is_torch_available():
import torch

datasets_logging.set_verbosity_error()
transformers_logging.set_verbosity_error()

@@ -47,15 +57,15 @@ def __init__(self, config: BackendConfigT):
self.logger.info("\t+ Benchmarking a Diffusers pipeline")
self.pretrained_config = get_diffusers_pretrained_config(self.config.model, **self.config.model_kwargs)
self.model_shapes = extract_diffusers_shapes_from_model(self.config.model, **self.config.model_kwargs)
self.model_type = self.config.task
self.automodel_loader = get_diffusers_automodel_loader_for_task(self.config.task)
self.pretrained_processor = None
self.generation_config = None

elif self.config.library == "timm":
self.logger.info("\t+ Benchmarking a Timm model")
self.pretrained_config = get_timm_pretrained_config(self.config.model)
self.model_shapes = extract_timm_shapes_from_config(self.pretrained_config)
self.model_type = self.pretrained_config.architecture
self.automodel_loader = get_timm_automodel_loader()
self.pretrained_processor = None
self.generation_config = None
elif self.config.library == "llama_cpp":
@@ -75,31 +85,42 @@ def __init__(self, config: BackendConfigT):
self.model_shapes = extract_transformers_shapes_from_artifacts(
self.pretrained_config, self.pretrained_processor
)
self.model_type = self.pretrained_config.model_type

self.automodel_class = get_automodel_class_for_task(
model_type=self.model_type, library=self.config.library, task=self.config.task, framework="pt"
)
self.logger.info(f"\t+ Using automodel class {self.automodel_class.__name__}")
self.automodel_loader = get_transformers_automodel_loader_for_task(self.config.task)

def seed(self) -> None:
set_seed(self.config.seed)

def prepare_for_inference(self, **kwargs) -> None:
def create_no_weights_model(self) -> None:
if self.pretrained_config is None:
raise ValueError("Can't create no weights model without a pretrained config")

self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model")
self.logger.info("\t+ Creating no weights model's directory")
os.makedirs(self.no_weights_model, exist_ok=True)
self.logger.info("\t+ Creating no weights model's state dict")
state_dict = torch.nn.Linear(1, 1).state_dict()
self.logger.info("\t+ Saving no weights model's safetensors")
safetensors = os.path.join(self.no_weights_model, "model.safetensors")
save_file(tensors=state_dict, filename=safetensors, metadata={"format": "pt"})
self.logger.info("\t+ Saving no weights model's config")
self.pretrained_config.save_pretrained(save_directory=self.no_weights_model)

def prepare_input_shapes(self, input_shapes: Dict[str, Any]) -> Dict[str, Any]:
"""
This method is used to prepare the model for inference.
It can be used to compile the model with certain input/output shapes, for example.
This method is used to prepare and register the input shapes before they are used by the model.
It can be used to pad the inputs to the correct shape, or compile them into the correct format.
"""
pass
return input_shapes

def prepare_inputs(
self, inputs: Dict[str, Any], input_shapes: Dict[str, Any]
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
"""
This method is used to prepare the inputs before passing them to the model.
It can be used to move the inputs to the correct device, for example.
This method is used to prepare and register the inputs before passing them to the model.
It can be used to move the inputs to the correct device, or rename their keys.
"""
return inputs, input_shapes
return inputs

def load(self) -> None:
raise NotImplementedError("Backend must implement load method")

def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict:
"""
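
The updated hooks change the backend contract: prepare_input_shapes now returns the (possibly adjusted) shapes, and prepare_inputs returns only the inputs. Below is a minimal sketch of a subclass following the new signatures; DummyBackend and its body are illustrative, not part of the codebase.

# Sketch only: a hypothetical backend implementing the new hook signatures.
from typing import Any, Dict

from optimum_benchmark.backends.base import Backend  # path per this diff


class DummyBackend(Backend):
    NAME = "dummy"

    def load(self) -> None:
        # the base class resolves self.automodel_loader from the task/library
        self.pretrained_model = self.automodel_loader.from_pretrained(self.config.model)

    def prepare_input_shapes(self, input_shapes: Dict[str, Any]) -> Dict[str, Any]:
        # adjust/register the shapes before inputs are generated (no-op here)
        return input_shapes

    def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        # e.g. move tensor inputs to the configured device
        return {name: value.to(self.config.device) for name, value in inputs.items()}
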
32 changes: 16 additions & 16 deletions optimum_benchmark/backends/config.py
@@ -7,7 +7,11 @@
from psutil import cpu_count

from ..system_utils import get_gpu_device_ids, is_nvidia_system, is_rocm_system
from ..task_utils import infer_library_from_model_name_or_path, infer_task_from_model_name_or_path
from ..task_utils import (
infer_library_from_model_name_or_path,
infer_model_type_from_model_name_or_path,
infer_task_from_model_name_or_path,
)

LOGGER = getLogger("backend")

@@ -20,15 +24,16 @@ class BackendConfig(ABC):

task: Optional[str] = None
library: Optional[str] = None
model_type: Optional[str] = None

model: Optional[str] = None
processor: Optional[str] = None

device: Optional[str] = None
device_ids: Optional[str] = None
# yes we use a string here instead of a list
# we use a string here instead of a list
# because it's easier to pass in a yaml or from cli
# and it's consistent with GPU environment variables
device_ids: Optional[str] = None

seed: int = 42
inter_op_num_threads: Optional[int] = None
@@ -39,29 +44,24 @@ class BackendConfig(ABC):
# processor kwargs that are added to its init method/constructor
processor_kwargs: Dict[str, Any] = field(default_factory=dict)

# deprecated
hub_kwargs: Dict[str, Any] = field(default_factory=dict)

def __post_init__(self):
if self.model is None:
raise ValueError("`model` must be specified.")

if self.processor is None:
self.processor = self.model

if self.hub_kwargs:
LOGGER.warning(
"`hub_kwargs` is deprecated and will be removed in future versions."
"Please use `model_kwargs` and `processor_kwargs` seperately."
)
self.model_kwargs = {**self.model_kwargs, **self.hub_kwargs}
self.processor_kwargs = {**self.processor_kwargs, **self.hub_kwargs}

# TODO: add cache_dir, token, etc. to these methods
if self.task is None:
self.task = infer_task_from_model_name_or_path(self.model, self.hub_kwargs.get("revision", None))
self.task = infer_task_from_model_name_or_path(self.model, self.model_kwargs.get("revision", None))

if self.library is None:
self.library = infer_library_from_model_name_or_path(self.model, self.hub_kwargs.get("revision", None))
self.library = infer_library_from_model_name_or_path(self.model, self.model_kwargs.get("revision", None))

if self.model_type is None:
self.model_type = infer_model_type_from_model_name_or_path(
self.model, self.model_kwargs.get("revision", None)
)

if self.device is None:
self.device = "cuda" if is_nvidia_system() or is_rocm_system() else "cpu"
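
With model_type now inferred alongside task and library, a concrete backend config only needs the model id in the simplest case. The snippet below is a sketch: PyTorchConfig is the config already used in examples/pytorch_llama.py, the Hub is queried at init time, and the printed values are examples.

from optimum_benchmark import PyTorchConfig

config = PyTorchConfig(model="gpt2", device="cpu")
# __post_init__ fills in the unspecified fields from the Hub metadata:
print(config.task, config.library, config.model_type)  # e.g. text-generation transformers gpt2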