diff --git a/.github/workflows/test_cli_llama_cpp.yaml b/.github/workflows/test_cli_llama_cpp.yaml
new file mode 100644
index 00000000..8e3e583d
--- /dev/null
+++ b/.github/workflows/test_cli_llama_cpp.yaml
@@ -0,0 +1,48 @@
+name: CLI Llama.cpp Tests
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - main
+    paths:
+      - .github/workflows/test_cli_llama_cpp.yaml
+      - "optimum_benchmark/**"
+      - "docker/**"
+      - "tests/**"
+      - "setup.py"
+  pull_request:
+    branches:
+      - main
+    paths:
+      - .github/workflows/test_cli_llama_cpp.yaml
+      - "optimum_benchmark/**"
+      - "docker/**"
+      - "tests/**"
+      - "setup.py"
+
+concurrency:
+  cancel-in-progress: true
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+
+jobs:
+  run_cli_llama_cpp_tests:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+
+      - name: Set up Python 3.10
+        uses: actions/setup-python@v3
+        with:
+          python-version: "3.10"
+
+      - name: Install requirements
+        run: |
+          pip install --upgrade pip
+          pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+          pip install -e .[testing,llama-cpp]
+
+      - name: Run tests
+        run: pytest -s -k "llama_cpp"
diff --git a/.gitignore b/.gitignore
index b30407e0..f26fda31 100644
--- a/.gitignore
+++ b/.gitignore
@@ -172,3 +172,7 @@ work-in-progress/
 experiments/
 amdsmi/
 amd-*
+
+# Mac specific
+.DS_Store
+outputs/
\ No newline at end of file
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 4da9ba85..9fc68d05 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -48,16 +48,16 @@ If you would like to work on any of the open Issues:
 6. Depending on the feature you're working on and your development environment, you can run tests locally in an isolated docker container using the [makefile](Makefile). 
For example, to test the CLI with CPU device and PyTorch backend, you can run the following commands: ```bash - make install_cli_cpu_pytorch_extras + make install_cli_cpu_pytorch make test_cli_cpu_pytorch ``` For a better development experience, we recommend using isolated docker containers to run tests: ```bash - make build_docker_cpu - make run_docker_cpu - make install_cli_cpu_pytorch_extras + make build_cpu_image + make run_cpu_container + make install_cli_cpu_pytorch make test_cli_cpu_pytorch ``` diff --git a/Makefile b/Makefile index 9ef27918..39c011fb 100644 --- a/Makefile +++ b/Makefile @@ -173,6 +173,8 @@ test_cli_rocm_pytorch_single_gpu: pytest -s -k "cli and rocm and pytorch and not (dp or ddp or device_map or deepspeed) and not (bnb or awq)" # llm-perf +test_cli_llama_cpp: + pytest -s -k "llama_cpp" install_llm_perf_cuda_pytorch: pip install packaging && pip install flash-attn einops scipy auto-gptq optimum bitsandbytes autoawq codecarbon diff --git a/examples/llama_cpp_embedding.yaml b/examples/llama_cpp_embedding.yaml new file mode 100644 index 00000000..bdd86cce --- /dev/null +++ b/examples/llama_cpp_embedding.yaml @@ -0,0 +1,26 @@ +defaults: + - benchmark + - scenario: inference + - launcher: inline + - backend: llama_cpp + - _base_ + - _self_ + +name: llama_cpp_llama + +backend: + device: mps + model: nomic-ai/nomic-embed-text-v1.5-GGUF + task: feature-extraction + filename: nomic-embed-text-v1.5.Q4_0.gguf + +scenario: + input_shapes: + batch_size: 1 + sequence_length: 256 + vocab_size: 30000 + type_vocab_size: 1 + max_position_embeddings: 512 + generate_kwargs: + max_new_tokens: 100 + min_new_tokens: 100 diff --git a/examples/llama_cpp_text_generation.yaml b/examples/llama_cpp_text_generation.yaml new file mode 100644 index 00000000..96def950 --- /dev/null +++ b/examples/llama_cpp_text_generation.yaml @@ -0,0 +1,25 @@ +defaults: + - benchmark + - scenario: inference + - launcher: inline + - backend: llama_cpp + - _base_ + - _self_ + +name: llama_cpp_llama + +backend: + device: mps + model: TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF + task: text-generation + filename: tinyllama-1.1b-chat-v1.0.Q4_0.gguf + + +scenario: + input_shapes: + batch_size: 1 + sequence_length: 256 + vocab_size: 32000 + generate_kwargs: + max_new_tokens: 100 + min_new_tokens: 100 diff --git a/examples/pytorch_bert_mps.yaml b/examples/pytorch_bert_mps.yaml new file mode 100644 index 00000000..4d4dc6e3 --- /dev/null +++ b/examples/pytorch_bert_mps.yaml @@ -0,0 +1,26 @@ +defaults: + - benchmark + - scenario: inference + - launcher: process # launcher: inline works, + - backend: pytorch + - _base_ + - _self_ + +name: pytorch_bert + +# launcher: +# start_method: spawn + +scenario: + latency: true + memory: true + input_shapes: + batch_size: 1 + sequence_length: 128 + +backend: + device: cpu + no_weights: true + model: bert-base-uncased + + diff --git a/optimum_benchmark/__init__.py b/optimum_benchmark/__init__.py index f96c9b16..83845b28 100644 --- a/optimum_benchmark/__init__.py +++ b/optimum_benchmark/__init__.py @@ -1,6 +1,7 @@ from .backends import ( BackendConfig, INCConfig, + LlamaCppConfig, LLMSwarmConfig, ORTConfig, OVConfig, @@ -38,4 +39,5 @@ "TrainingConfig", "TRTLLMConfig", "VLLMConfig", + "LlamaCppConfig", ] diff --git a/optimum_benchmark/backends/__init__.py b/optimum_benchmark/backends/__init__.py index ab1ebf4e..e78bb2e7 100644 --- a/optimum_benchmark/backends/__init__.py +++ b/optimum_benchmark/backends/__init__.py @@ -1,4 +1,5 @@ from .config import BackendConfig +from .llama_cpp.config 
import LlamaCppConfig from .llm_swarm.config import LLMSwarmConfig from .neural_compressor.config import INCConfig from .onnxruntime.config import ORTConfig @@ -20,4 +21,5 @@ "LLMSwarmConfig", "BackendConfig", "VLLMConfig", + "LlamaCppConfig", ] diff --git a/optimum_benchmark/backends/base.py b/optimum_benchmark/backends/base.py index 8ae7e2cf..d1b1b374 100644 --- a/optimum_benchmark/backends/base.py +++ b/optimum_benchmark/backends/base.py @@ -68,7 +68,9 @@ def __init__(self, config: BackendConfigT): self.automodel_loader = get_timm_automodel_loader() self.pretrained_processor = None self.generation_config = None - + elif self.config.library == "llama_cpp": + self.logger.info("\t+ Benchmarking a Llama.cpp model") + self.model_shapes = {} else: self.logger.info("\t+ Benchmarking a Transformers model") self.generation_config = get_transformers_generation_config(self.config.model, **self.config.model_kwargs) diff --git a/optimum_benchmark/backends/config.py b/optimum_benchmark/backends/config.py index 8be19c3d..28ad440b 100644 --- a/optimum_benchmark/backends/config.py +++ b/optimum_benchmark/backends/config.py @@ -53,14 +53,16 @@ def __post_init__(self): # TODO: add cache_dir, token, etc. to these methods if self.task is None: - self.task = infer_task_from_model_name_or_path(self.model, self.model_kwargs.get("revision", None)) + self.task = infer_task_from_model_name_or_path( + self.model, self.model_kwargs.get("revision", None), self.library + ) if self.library is None: self.library = infer_library_from_model_name_or_path(self.model, self.model_kwargs.get("revision", None)) if self.model_type is None: self.model_type = infer_model_type_from_model_name_or_path( - self.model, self.model_kwargs.get("revision", None) + self.model, self.model_kwargs.get("revision", None), self.library ) if self.device is None: @@ -90,8 +92,10 @@ def __post_init__(self): else: raise RuntimeError("CUDA device is only supported on systems with NVIDIA or ROCm drivers.") - if self.library not in ["transformers", "diffusers", "timm"]: - raise ValueError(f"`library` must be either `transformers`, `diffusers` or `timm`, but got {self.library}") + if self.library not in ["transformers", "diffusers", "timm", "llama_cpp"]: + raise ValueError( + f"`library` must be either `transformers`, `diffusers`, `timm` or `llama_cpp`, but got {self.library}" + ) if self.inter_op_num_threads is not None: if self.inter_op_num_threads == -1: diff --git a/optimum_benchmark/backends/llama_cpp/__init__.py b/optimum_benchmark/backends/llama_cpp/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/optimum_benchmark/backends/llama_cpp/backend.py b/optimum_benchmark/backends/llama_cpp/backend.py new file mode 100644 index 00000000..60be9066 --- /dev/null +++ b/optimum_benchmark/backends/llama_cpp/backend.py @@ -0,0 +1,92 @@ +from tempfile import TemporaryDirectory +from typing import Any, Dict, Tuple + +from llama_cpp import Llama + +from ..base import Backend +from .config import LlamaCppConfig + + +class LlamaCppBackend(Backend[LlamaCppConfig]): + NAME: str = "llama_cpp" + + def __init__(self, config: LlamaCppConfig) -> None: + super().__init__(config) + + if self.config.no_weights: + self.logger.info("\t+ Loading no weights model") + raise NotImplementedError("No weights model is not yet implemented") + + def load(self) -> None: + self.logger.info("\t+ Creating backend temporary directory") + self.tmpdir = TemporaryDirectory() + self.logger.info("\t+ Loading pretrained model") + self.load_model_from_pretrained() + 
self.tmpdir.cleanup()
+
+    def load_model_from_pretrained(self) -> None:
+        """
+        Load the pretrained model from the given model name (normally a GGUF or GGML file).
+        """
+        embedding = True if self.config.task == "feature-extraction" else False
+
+        self.pretrained_model = Llama.from_pretrained(
+            repo_id=self.config.model,  # type: ignore
+            filename=self.config.filename,
+            verbose=False,
+            echo=False,
+            embedding=embedding,
+        )  # type: ignore
+
+    def validate_task(self) -> None:
+        if self.config.task not in ["text-generation", "feature-extraction"]:
+            raise ValueError(f"Task {self.config.task} not supported by {self.NAME}")
+
+    def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        if self.config.task == "text-generation":
+            if inputs["input_ids"].shape[0] != 1:
+                raise ValueError("Batch size must be 1 for Llama.cpp text generation")
+
+            inputs = super().prepare_inputs(inputs)
+            inputs["tokens"] = inputs["input_ids"].squeeze()
+
+            return inputs
+        elif self.config.task == "feature-extraction":
+            detokenized_batch = list(map(self.pretrained_model.detokenize, inputs["input_ids"]))
+            decoded_batch = [x.decode("utf-8") for x in detokenized_batch]
+
+            inputs["input_str"] = decoded_batch
+            return inputs
+
+        raise ValueError(f"Task {self.config.task} not supported by {self.NAME}")
+
+    def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> Any:
+        """
+        Forward pass of the model.
+        Gets the embeddings of the input tokens.
+        """
+
+        return self.pretrained_model.embed(inputs["input_str"])
+
+    def prefill(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Prefill the model with the input tokens.
+        We consider prefill as the time to first token, so we measure the time the model takes to generate its first token.
+        """
+
+        next(self.pretrained_model.generate(tokens=inputs["tokens"]))
+        return inputs
+
+    def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> list[int]:
+        """
+        Generate new tokens from the pretrained model.
+        """
+
+        output = []
+
+        for token in self.pretrained_model.generate(tokens=inputs["tokens"]):
+            output.append(token)
+            if len(output) >= kwargs["max_new_tokens"]:
+                break
+
+        return output
diff --git a/optimum_benchmark/backends/llama_cpp/config.py b/optimum_benchmark/backends/llama_cpp/config.py
new file mode 100644
index 00000000..794ea9d3
--- /dev/null
+++ b/optimum_benchmark/backends/llama_cpp/config.py
@@ -0,0 +1,34 @@
+from dataclasses import dataclass
+from logging import getLogger
+from typing import Optional
+
+from ...import_utils import llama_cpp_version
+from ..config import BackendConfig
+
+LOGGER = getLogger("backend")
+
+
+def llama_cpp_model_kwargs():
+    return {"verbose": True}
+
+
+@dataclass
+class LlamaCppConfig(BackendConfig):
+    name: str = "llama_cpp"
+    version: Optional[str] = llama_cpp_version()
+    _target_: str = "optimum_benchmark.backends.llama_cpp.backend.LlamaCppBackend"
+
+    no_weights: bool = False
+    library: str = "llama_cpp"
+    filename: Optional[str] = None
+
+    def __post_init__(self):
+        super().__post_init__()
+
+        self.device = self.device.lower()  # type: ignore
+        self.library = "llama_cpp"
+
+        if self.device not in ["cuda", "mps", "cpu"]:
+            raise ValueError(f"Llama.cpp Backend only supports 'cpu', 'mps' and 'cuda' devices, got {self.device}")
+
+        LOGGER.warning("Llama.cpp automatically selects the device, ignoring the device parameter in the config.")
diff --git a/optimum_benchmark/cli.py b/optimum_benchmark/cli.py
index 7b44f621..57c6b054 100644
--- a/optimum_benchmark/cli.py
+++ b/optimum_benchmark/cli.py
@@ -13,6 +13,7 @@
     INCConfig,
     InferenceConfig,
     InlineConfig,
+    LlamaCppConfig,
     LLMSwarmConfig,
     ORTConfig,
     OVConfig,
@@ -44,6 +45,7 @@
 cs.store(group="backend", name=PyTXIConfig.name, node=PyTXIConfig)
 cs.store(group="backend", name=LLMSwarmConfig.name, node=LLMSwarmConfig)
 cs.store(group="backend", name=VLLMConfig.name, node=VLLMConfig)
+cs.store(group="backend", name=LlamaCppConfig.name, node=LlamaCppConfig)
 # scenarios configurations
 cs.store(group="scenario", name=TrainingConfig.name, node=TrainingConfig)
 cs.store(group="scenario", name=InferenceConfig.name, node=InferenceConfig)
diff --git a/optimum_benchmark/generators/task_generator.py b/optimum_benchmark/generators/task_generator.py
index 78e2a754..b2f88296 100644
--- a/optimum_benchmark/generators/task_generator.py
+++ b/optimum_benchmark/generators/task_generator.py
@@ -351,7 +351,12 @@ class FeatureExtractionGenerator(TextGenerator, ImageGenerator):
     def __call__(self):
         dummy = {}
 
-        if self.shapes["num_channels"] is not None and self.shapes["height"] is not None:
+        if (
+            "num_channels" in self.shapes
+            and self.shapes["num_channels"] is not None
+            and "height" in self.shapes
+            and self.shapes["height"] is not None
+        ):
             dummy["pixel_values"] = self.pixel_values()
         else:
             dummy["input_ids"] = self.input_ids()
diff --git a/optimum_benchmark/import_utils.py b/optimum_benchmark/import_utils.py
index 7fdff6ef..e731bc74 100644
--- a/optimum_benchmark/import_utils.py
+++ b/optimum_benchmark/import_utils.py
@@ -32,12 +32,17 @@
 _llm_swarm_available = importlib.util.find_spec("llm_swarm") is not None
 _zentorch_available = importlib.util.find_spec("zentorch") is not None
 _vllm_available = importlib.util.find_spec("vllm") is not None
+_llama_cpp_available = importlib.util.find_spec("llama_cpp") is not None
 
 
 def is_vllm_available():
     return _vllm_available
 
 
+def is_llama_cpp_available():
+    return _llama_cpp_available
+
+
 def is_zentorch_available():
     return _zentorch_available
 
@@ -223,6 +228,11 @@ def vllm_version():
     return importlib.metadata.version("vllm")
 
 
+def llama_cpp_version():
+    if _llama_cpp_available:
+        return importlib.metadata.version("llama-cpp-python")
+
+
 def get_git_revision_hash(package_name: str) -> Optional[str]:
     """
     Returns the git commit SHA of a package installed from a git repository.
diff --git a/optimum_benchmark/task_utils.py b/optimum_benchmark/task_utils.py
index 4587097f..f8d664b8 100644
--- a/optimum_benchmark/task_utils.py
+++ b/optimum_benchmark/task_utils.py
@@ -101,13 +101,20 @@ def infer_library_from_model_name_or_path(model_name_or_path: str, revision: Opt
     return inferred_library_name
 
 
-def infer_task_from_model_name_or_path(model_name_or_path: str, revision: Optional[str] = None) -> str:
-    library_name = infer_library_from_model_name_or_path(model_name_or_path, revision=revision)
+def infer_task_from_model_name_or_path(
+    model_name_or_path: str, revision: Optional[str] = None, library_name: Optional[str] = None
+) -> str:
+    if library_name is None:
+        library_name = infer_library_from_model_name_or_path(model_name_or_path, revision=revision)
 
     inferred_task_name = None
 
     if library_name == "timm":
         inferred_task_name = "image-classification"
+
+    elif library_name == "llama_cpp":
+        inferred_task_name = "text-generation"
+
     elif library_name == "sentence-transformers":
         inferred_task_name = "feature-extraction"
     elif huggingface_hub.repo_exists(model_name_or_path):
@@ -168,8 +175,11 @@ def infer_task_from_model_name_or_path(model_name_or_path: str, revision: Option
     return inferred_task_name
 
 
-def infer_model_type_from_model_name_or_path(model_name_or_path: str, revision: Optional[str] = None) -> str:
-    library_name = infer_library_from_model_name_or_path(model_name_or_path, revision=revision)
+def infer_model_type_from_model_name_or_path(
+    model_name_or_path: str, revision: Optional[str] = None, library_name: Optional[str] = None
+) -> str:
+    if library_name is None:
+        library_name = infer_library_from_model_name_or_path(model_name_or_path, revision=revision)
 
     inferred_model_type = None
 
@@ -191,6 +201,8 @@ def infer_model_type_from_model_name_or_path(model_name_or_path: str, revision:
             break
         if inferred_model_type is not None:
             break
+    elif library_name == "llama_cpp":
+        inferred_model_type = "llama_cpp"
     else:
         from transformers import AutoConfig
 
diff --git a/setup.py b/setup.py
index b024a738..9580f14a 100644
--- a/setup.py
+++ b/setup.py
@@ -78,6 +78,7 @@
     "llm-swarm": ["llm-swarm"],
     "py-txi": ["py-txi"],
     "vllm": ["vllm"],
+    "llama-cpp": ["llama-cpp-python"],
     # optional dependencies
     "autoawq": [AUTOAWQ],
     "auto-gptq": ["optimum", AUTOGPTQ],
@@ -113,7 +114,7 @@
         "Topic :: Scientific/Engineering :: Artificial Intelligence",
     ],
     keywords="benchmaek, transformers, quantization, pruning, optimization, training, inference, onnx, onnx runtime, intel, "
-    "habana, graphcore, neural compressor, ipex, ipu, hpu, llm-swarm, py-txi, vllm, auto-gptq, autoawq, "
+    "habana, graphcore, neural compressor, ipex, ipu, hpu, llm-swarm, py-txi, vllm, llama-cpp, auto-gptq, autoawq, "
     "sentence-transformers, bitsandbytes, codecarbon, flash-attn, deepspeed, diffusers, timm, peft",
     long_description=open("README.md", "r", encoding="utf-8").read(),
     long_description_content_type="text/markdown",
diff --git a/tests/configs/cpu_inference_llama_cpp_embedding.yaml b/tests/configs/cpu_inference_llama_cpp_embedding.yaml
new file mode 100644
index 00000000..3c3d09c0
--- /dev/null
+++ b/tests/configs/cpu_inference_llama_cpp_embedding.yaml
@@ -0,0 +1,23 @@
+defaults:
+  - _base_
+  - _inference_
+  - _self_
+  - override backend: llama_cpp
+
+name: inference_llama_cpp_embedding
+
+backend:
+  model: nomic-ai/nomic-embed-text-v1.5-GGUF
+  task: feature-extraction
+  filename: nomic-embed-text-v1.5.Q4_0.gguf
+
+scenario:
+  input_shapes:
+    batch_size: 1
+    sequence_length: 256
+    vocab_size: 30000
+    type_vocab_size: 1
+    max_position_embeddings: 512
+  generate_kwargs:
+    max_new_tokens: 100
+    min_new_tokens: 100
\ No newline at end of file
diff --git a/tests/configs/cpu_inference_llama_cpp_text_generation.yaml b/tests/configs/cpu_inference_llama_cpp_text_generation.yaml
new file mode 100644
index 00000000..5c53e8ec
--- /dev/null
+++ b/tests/configs/cpu_inference_llama_cpp_text_generation.yaml
@@ -0,0 +1,21 @@
+defaults:
+  - _base_
+  - _inference_
+  - _self_
+  - override backend: llama_cpp
+
+name: inference_llama_cpp_text_generation
+
+backend:
+  model: TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF
+  task: text-generation
+  filename: tinyllama-1.1b-chat-v1.0.Q4_0.gguf
+
+scenario:
+  input_shapes:
+    batch_size: 1
+    sequence_length: 256
+    vocab_size: 32000
+  generate_kwargs:
+    max_new_tokens: 100
+    min_new_tokens: 100
\ No newline at end of file
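
For a quick manual check of the new backend outside the test suite, something like the following should work once the branch is installed with the llama-cpp extra. It is a minimal sketch that reuses the values from tests/configs/cpu_inference_llama_cpp_text_generation.yaml and assumes the existing Benchmark / BenchmarkConfig / InferenceConfig / InlineConfig Python API behaves as in the current README example; treat the exact kwargs as indicative rather than definitive.

```python
# Hypothetical smoke test for the llama_cpp backend added in this PR.
# Assumes the existing top-level API (Benchmark, BenchmarkConfig, InferenceConfig, InlineConfig)
# and reuses the model/filename/shapes from the new text-generation test config.
from optimum_benchmark import (
    Benchmark,
    BenchmarkConfig,
    InferenceConfig,
    InlineConfig,
    LlamaCppConfig,
)

if __name__ == "__main__":
    # Backend: the GGUF file is fetched from the Hub via Llama.from_pretrained.
    backend_config = LlamaCppConfig(
        device="cpu",
        model="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
        task="text-generation",
        filename="tinyllama-1.1b-chat-v1.0.Q4_0.gguf",
    )
    # Scenario: batch_size must stay 1, as enforced in LlamaCppBackend.prepare_inputs.
    scenario_config = InferenceConfig(
        latency=True,
        memory=True,
        input_shapes={"batch_size": 1, "sequence_length": 256, "vocab_size": 32000},
        generate_kwargs={"max_new_tokens": 100, "min_new_tokens": 100},
    )
    benchmark_config = BenchmarkConfig(
        name="llama_cpp_tinyllama",
        launcher=InlineConfig(),  # the examples use the inline launcher for llama.cpp
        scenario=scenario_config,
        backend=backend_config,
    )
    report = Benchmark.launch(benchmark_config)
    print(report)
```

Swapping in task: feature-extraction with the nomic-ai GGUF file from the embedding config should exercise the embed path of the backend in the same way.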