Add llama.cpp backend #231

Merged · 15 commits · Jul 30, 2024
3 changes: 3 additions & 0 deletions .gitignore
@@ -172,3 +172,6 @@ work-in-progress/
experiments/
amdsmi/
amd-*

# Mac specific
.DS_Store
6 changes: 3 additions & 3 deletions CONTRIBUTING.md
@@ -48,16 +48,16 @@ If you would like to work on any of the open Issues:
6. Depending on the feature you're working on and your development environment, you can run tests locally in an isolated docker container using the [makefile](Makefile). For example, to test the CLI with CPU device and PyTorch backend, you can run the following commands:

```bash
make install_cli_cpu_pytorch_extras
make install_cli_cpu_pytorch
make test_cli_cpu_pytorch
```

For a better development experience, we recommend using isolated docker containers to run tests:

```bash
make build_docker_cpu
make build_cpu_image
make run_docker_cpu
make install_cli_cpu_pytorch_extras
make install_cli_cpu_pytorch
make test_cli_cpu_pytorch
```

25 changes: 25 additions & 0 deletions examples/llama_cpp.yaml
@@ -0,0 +1,25 @@
defaults:
  - benchmark
  - scenario: inference
  - launcher: inline
  - backend: llama_cpp
  - _base_
  - _self_

name: llama_cpp_llama_v2

backend:
  device: mps
  model: TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF
  task: text-generation
  filename: tinyllama-1.1b-chat-v1.0.Q4_0.gguf

scenario:
  input_shapes:
    batch_size: 1
    sequence_length: 256
    vocab_size: 32000
  generate_kwargs:
    max_new_tokens: 100
    min_new_tokens: 100
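
For reference, the same benchmark can also be assembled programmatically rather than through the Hydra config above. The following is a minimal sketch, assuming `Benchmark` and `BenchmarkConfig` are exposed at the package top level alongside the config classes registered in `cli.py`; the field values simply mirror the YAML.

```python
# Hedged sketch: programmatic equivalent of examples/llama_cpp.yaml.
# Assumes Benchmark and BenchmarkConfig are exported from optimum_benchmark,
# like the config classes imported in cli.py in this PR.
from optimum_benchmark import (
    Benchmark,
    BenchmarkConfig,
    InferenceConfig,
    InlineConfig,
    LlamaCppConfig,
)

backend_config = LlamaCppConfig(
    device="mps",
    model="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
    task="text-generation",
    filename="tinyllama-1.1b-chat-v1.0.Q4_0.gguf",
)
scenario_config = InferenceConfig(
    input_shapes={"batch_size": 1, "sequence_length": 256, "vocab_size": 32000},
    generate_kwargs={"max_new_tokens": 100, "min_new_tokens": 100},
)
benchmark_config = BenchmarkConfig(
    name="llama_cpp_llama_v2",
    launcher=InlineConfig(),
    scenario=scenario_config,
    backend=backend_config,
)
benchmark_report = Benchmark.launch(benchmark_config)
```
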
25 changes: 25 additions & 0 deletions examples/llama_cpp_mps.yaml
@@ -0,0 +1,25 @@
defaults:
  - benchmark
  - scenario: inference
  - launcher: inline
  - backend: llama_cpp
  - _base_
  - _self_

name: llama_cpp_llama

backend:
  device: mps
  model: TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF
  task: text-generation
  filename: tinyllama-1.1b-chat-v1.0.Q4_0.gguf

scenario:
  input_shapes:
    batch_size: 1
    sequence_length: 256
    vocab_size: 32000
  generate_kwargs:
    max_new_tokens: 100
    min_new_tokens: 100
23 changes: 23 additions & 0 deletions examples/llama_mps.yaml
@@ -0,0 +1,23 @@
defaults:
  - benchmark
  - scenario: inference
  - launcher: inline
  - backend: pytorch
  - _base_
  - _self_

name: llama_tiny_mps

backend:
  device: mps
  model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
  task: text-generation

scenario:
  input_shapes:
    batch_size: 4
    sequence_length: 256
    vocab_size: 32000
  generate_kwargs:
    max_new_tokens: 100
    min_new_tokens: 100
23 changes: 23 additions & 0 deletions examples/llama_tiny_mps.yaml
@@ -0,0 +1,23 @@
defaults:
  - benchmark
  - scenario: inference
  - launcher: inline
  - backend: pytorch
  - _base_
  - _self_

name: llama_tiny_mps

backend:
  device: mps
  model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
  task: text-generation

scenario:
  input_shapes:
    batch_size: 4
    sequence_length: 256
    vocab_size: 32000
  generate_kwargs:
    max_new_tokens: 100
    min_new_tokens: 100
26 changes: 26 additions & 0 deletions examples/pytorch_bert_mps.yaml
@@ -0,0 +1,26 @@
defaults:
  - benchmark
  - scenario: inference
  - launcher: process # launcher: inline also works
  - backend: pytorch
  - _base_
  - _self_

name: pytorch_bert

# launcher:
#   start_method: spawn

scenario:
  latency: true
  memory: true
  input_shapes:
    batch_size: 1
    sequence_length: 128

backend:
  device: cpu
  no_weights: true
  model: bert-base-uncased

2 changes: 2 additions & 0 deletions optimum_benchmark/__init__.py
@@ -1,6 +1,7 @@
from .backends import (
    BackendConfig,
    INCConfig,
    LlamaCppConfig,
    LLMSwarmConfig,
    ORTConfig,
    OVConfig,
@@ -38,4 +39,5 @@
    "TrainingConfig",
    "TRTLLMConfig",
    "VLLMConfig",
    "LlamaCppConfig",
]
2 changes: 2 additions & 0 deletions optimum_benchmark/backends/__init__.py
@@ -1,4 +1,5 @@
from .config import BackendConfig
from .llama_cpp.config import LlamaCppConfig
from .llm_swarm.config import LLMSwarmConfig
from .neural_compressor.config import INCConfig
from .onnxruntime.config import ORTConfig
@@ -20,4 +21,5 @@
    "LLMSwarmConfig",
    "BackendConfig",
    "VLLMConfig",
    "LlamaCppConfig",
]
8 changes: 7 additions & 1 deletion optimum_benchmark/backends/base.py
@@ -68,7 +68,13 @@ def __init__(self, config: BackendConfigT):
            self.automodel_loader = get_timm_automodel_loader()
            self.pretrained_processor = None
            self.generation_config = None

        elif self.config.library == "llama_cpp":
            self.logger.info("\t+ Benchmarking a Llama.cpp model")
            self.pretrained_config = None
            self.model_shapes = {}
            self.model_type = self.config.task
            self.pretrained_processor = None
            self.generation_config = None
        else:
            self.logger.info("\t+ Benchmarking a Transformers model")
            self.generation_config = get_transformers_generation_config(self.config.model, **self.config.model_kwargs)
6 changes: 4 additions & 2 deletions optimum_benchmark/backends/config.py
@@ -90,8 +90,10 @@ def __post_init__(self):
            else:
                raise RuntimeError("CUDA device is only supported on systems with NVIDIA or ROCm drivers.")

        if self.library not in ["transformers", "diffusers", "timm"]:
            raise ValueError(f"`library` must be either `transformers`, `diffusers` or `timm`, but got {self.library}")
        if self.library not in ["transformers", "diffusers", "timm", "llama_cpp"]:
            raise ValueError(
                f"`library` must be either `transformers`, `diffusers`, `timm` or `llama_cpp`, but got {self.library}"
            )

        if self.inter_op_num_threads is not None:
            if self.inter_op_num_threads == -1:
Empty file.
93 changes: 93 additions & 0 deletions optimum_benchmark/backends/llama_cpp/backend.py
@@ -0,0 +1,93 @@
from tempfile import TemporaryDirectory
from typing import Any, Dict, Tuple

from llama_cpp import Llama

from optimum_benchmark.backends.base import Backend
from optimum_benchmark.task_utils import TEXT_GENERATION_TASKS

from .config import LlamaCppConfig


class LlamaCppBackend(Backend[LlamaCppConfig]):
    NAME: str = "llama_cpp"

    def __init__(self, config: LlamaCppConfig) -> None:
        super().__init__(config)

        if self.config.no_weights:
            self.logger.info("\t+ Loading no weights model")
            raise NotImplementedError("No weights model is not yet implemented")

        self.validate_task()

        self.logger.info("\t+ Creating backend temporary directory")
        self.tmpdir = TemporaryDirectory()
        self.logger.info("\t+ Loading pretrained model")
        self.load_model_from_pretrained()
        self.tmpdir.cleanup()

    def load(self) -> None:
        self.load_model_from_pretrained()

    def load_model_from_pretrained(self) -> None:
        """
        Load the pretrained model from the given model name (normally a GGUF or GGML file).
        """

        if self.config.task not in TEXT_GENERATION_TASKS:
            raise NotImplementedError(f"Llama.cpp does not support task {self.config.task}")

        self.pretrained_model = Llama.from_pretrained(
            repo_id=self.config.model,  # type: ignore
            filename=self.config.filename,
            verbose=False,
            echo=False,
        )  # type: ignore

    def validate_task(self) -> None:
        if self.config.task not in ["text-generation"]:
            raise ValueError(f"Task {self.config.task} not supported by {self.NAME}")

    def prepare_inputs(self, inputs: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
        if inputs["input_ids"].shape[0] != 1:
            raise ValueError("Batch size must be 1 for Llama.cpp")

        inputs = super().prepare_inputs(inputs)
        inputs["tokens"] = inputs["input_ids"].squeeze()

        return inputs

    def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> Any:
        """
        Forward pass of the model.
        Evaluates the probabilities of the given tokens.
        """
        return self.pretrained_model.eval(
            tokens=inputs["tokens"],
        )

    def prefill(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> Dict[str, Any]:
        """
        Prefill the model with the input tokens.
        We treat prefill as time to first token, so we measure how long the model takes to generate its first token.
        """

        next(self.pretrained_model.generate(tokens=inputs["tokens"]))
        return inputs

    def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> list[int]:
        """
        Generate new tokens from the pretrained model.
        """

        output = []

        for token in self.pretrained_model.generate(tokens=inputs["tokens"]):
            output.append(token)
            if len(output) >= kwargs["max_new_tokens"]:
                break

        return output
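
For orientation, the backend methods above map onto plain llama-cpp-python calls roughly as follows. This standalone sketch is not part of the PR; the repo id and filename are taken from the example configs, and the prompt text is illustrative.

```python
# Standalone sketch of the llama-cpp-python calls LlamaCppBackend wraps.
# Repo id and filename come from the example configs; the prompt is made up.
from llama_cpp import Llama

llm = Llama.from_pretrained(
    repo_id="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
    filename="tinyllama-1.1b-chat-v1.0.Q4_0.gguf",
    verbose=False,
)

prompt_tokens = llm.tokenize(b"The quick brown fox")

# forward(): evaluate the prompt tokens
llm.eval(prompt_tokens)

# prefill(): time to first token, i.e. pull a single token from the generator
first_token = next(llm.generate(tokens=prompt_tokens))

# generate(): collect up to max_new_tokens tokens
max_new_tokens = 100
output = []
for token in llm.generate(tokens=prompt_tokens):
    output.append(token)
    if len(output) >= max_new_tokens:
        break

print(llm.detokenize(output).decode("utf-8", errors="ignore"))
```
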
35 changes: 35 additions & 0 deletions optimum_benchmark/backends/llama_cpp/config.py
@@ -0,0 +1,35 @@
from dataclasses import dataclass
from typing import Optional

from optimum_benchmark.task_utils import TEXT_GENERATION_TASKS

from ...import_utils import llama_cpp_version
from ..config import BackendConfig


def llama_cpp_model_kwargs():
    return {"verbose": True}


@dataclass
class LlamaCppConfig(BackendConfig):
    name: str = "llama_cpp"
    version: Optional[str] = llama_cpp_version()
    _target_: str = "optimum_benchmark.backends.llama_cpp.backend.LlamaCppBackend"

    no_weights: bool = False
    library: str = "llama_cpp"
    filename: Optional[str] = None

    def __post_init__(self):
        super().__post_init__()

        if self.task not in TEXT_GENERATION_TASKS:
            raise NotImplementedError(f"Llama.cpp does not support task {self.task}")

        self.device = self.device.lower()  # type: ignore

        if self.device not in ["cuda", "mps", "cpu"]:
            raise ValueError(f"Llama.cpp Backend only supports 'cpu', 'mps' and 'cuda' devices, got {self.device}")
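
A quick sketch of what this validation accepts and rejects, with illustrative values and assuming no required fields beyond those shown here:

```python
# Hedged sketch: exercising the LlamaCppConfig validation in __post_init__ above.
from optimum_benchmark.backends.llama_cpp.config import LlamaCppConfig

# Accepted: a text-generation task on a supported device.
config = LlamaCppConfig(
    model="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
    filename="tinyllama-1.1b-chat-v1.0.Q4_0.gguf",
    task="text-generation",
    device="cpu",
)

# Rejected: devices outside cpu/mps/cuda raise a ValueError.
try:
    LlamaCppConfig(model="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", task="text-generation", device="tpu")
except ValueError as err:
    print(err)
```
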
2 changes: 2 additions & 0 deletions optimum_benchmark/cli.py
@@ -13,6 +13,7 @@
    INCConfig,
    InferenceConfig,
    InlineConfig,
    LlamaCppConfig,
    LLMSwarmConfig,
    ORTConfig,
    OVConfig,
@@ -44,6 +45,7 @@
cs.store(group="backend", name=PyTXIConfig.name, node=PyTXIConfig)
cs.store(group="backend", name=LLMSwarmConfig.name, node=LLMSwarmConfig)
cs.store(group="backend", name=VLLMConfig.name, node=VLLMConfig)
cs.store(group="backend", name=LlamaCppConfig.name, node=LlamaCppConfig)
# scenarios configurations
cs.store(group="scenario", name=TrainingConfig.name, node=TrainingConfig)
cs.store(group="scenario", name=InferenceConfig.name, node=InferenceConfig)
12 changes: 12 additions & 0 deletions optimum_benchmark/import_utils.py
@@ -32,12 +32,17 @@
_llm_swarm_available = importlib.util.find_spec("llm_swarm") is not None
_zentorch_available = importlib.util.find_spec("zentorch") is not None
_vllm_available = importlib.util.find_spec("vllm") is not None
_llama_cpp_available = importlib.util.find_spec("llama_cpp") is not None


def is_vllm_available():
    return _vllm_available


def is_llama_cpp_available():
    return _llama_cpp_available


def is_zentorch_available():
    return _zentorch_available

@@ -223,6 +228,11 @@ def vllm_version():
    return importlib.metadata.version("vllm")


def llama_cpp_version():
    if _llama_cpp_available:
        return importlib.metadata.version("llama_cpp_python")


def get_git_revision_hash(package_name: str) -> Optional[str]:
"""
Returns the git commit SHA of a package installed from a git repository.
@@ -258,4 +268,6 @@ def get_hf_libs_info():
"timm_commit": get_git_revision_hash("timm"),
"peft_version": peft_version() if is_peft_available() else None,
"peft_commit": get_git_revision_hash("peft"),
"llama_cpp_version": llama_cpp_version(),
"llama_cpp_commit": get_git_revision_hash("llama_cpp"),
    }
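
The new helpers follow the same availability/version pattern as the other integrations; a small usage sketch:

```python
# Sketch: guarding llama.cpp-specific code paths on the new helpers.
from optimum_benchmark.import_utils import is_llama_cpp_available, llama_cpp_version

if is_llama_cpp_available():
    print(f"llama-cpp-python {llama_cpp_version()} is installed")
else:
    print("llama-cpp-python is not installed; install it to use the llama_cpp backend")
```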