From 2a75c0bc0d007cc875fa0f75ca41d02e46f917be Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Wed, 3 Jul 2024 16:05:34 +0200 Subject: [PATCH 1/6] Fix per token latency (#223) --- optimum_benchmark/trackers/latency.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/optimum_benchmark/trackers/latency.py b/optimum_benchmark/trackers/latency.py index 340fcc61..1f74a377 100644 --- a/optimum_benchmark/trackers/latency.py +++ b/optimum_benchmark/trackers/latency.py @@ -262,7 +262,7 @@ def __init__(self, device: str, backend: str): LOGGER.info("\t+ Tracking latency using CPU performance counter") self.start_time: Optional[float] = None - self.next_is_prefill_end_decode_start: Optional[bool] = None + self.prefilled: Optional[bool] = None self.per_token_events: List[Union[float, torch.cuda.Event]] = [] self.prefill_start_events: List[Union[float, torch.cuda.Event]] = [] @@ -272,7 +272,7 @@ def __init__(self, device: str, backend: str): def reset(self): self.start_time = None - self.next_is_prefill_end_decode_start = None + self.prefilled = None self.per_token_events = [] self.prefill_start_events = [] @@ -291,11 +291,13 @@ def track(self): else: self.prefill_start_events.append(time.perf_counter()) - self.next_is_prefill_end_decode_start = True # this is used to record the end of prefill and start of decode + self.prefilled = False - yield # this is where generate is called, and for each decoded token, we record an event + # this is where generate is called, + # and for each decoded token, we record an event + yield - self.next_is_prefill_end_decode_start = None + self.prefilled = None if self.is_asynchronous: self.decode_end_events.append(torch.cuda.Event(enable_timing=True)) @@ -308,7 +310,7 @@ def track(self): def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor): assert ( - self.next_is_prefill_end_decode_start is not None + self.prefilled is not None ), "PerTokenLatencyLogitsProcessor should only be called inside of track() context" if self.is_asynchronous: @@ -317,12 +319,12 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor): else: event = time.perf_counter() - if self.next_is_prefill_end_decode_start: + self.per_token_events.append(event) + + if not self.prefilled: self.prefill_end_events.append(event) self.decode_start_events.append(event) - self.next_is_prefill_end_decode_start = False - else: - self.per_token_events.append(event) + self.prefilled = True return scores From 8ebe8531a5b12ea7926cab66905381c76b24fcc4 Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Wed, 3 Jul 2024 16:45:43 +0200 Subject: [PATCH 2/6] Patch release (#224) --- optimum_benchmark/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum_benchmark/version.py b/optimum_benchmark/version.py index e1107dcd..d4044aeb 100644 --- a/optimum_benchmark/version.py +++ b/optimum_benchmark/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "0.3.0" +__version__ = "0.3.1" From 79990507b694d513bac81e140baff3af23a6bff7 Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Wed, 3 Jul 2024 17:28:10 +0200 Subject: [PATCH 3/6] Per token latency outliers (#225) --- optimum_benchmark/import_utils.py | 12 ++++++------ optimum_benchmark/trackers/latency.py | 25 ++++++++++++++----------- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/optimum_benchmark/import_utils.py b/optimum_benchmark/import_utils.py index 09a2a08d..7fdff6ef 100644 --- a/optimum_benchmark/import_utils.py +++ b/optimum_benchmark/import_utils.py @@ -246,16 +246,16 @@ def get_hf_libs_info(): return { "optimum_benchmark_version": optimum_benchmark_version(), "optimum_benchmark_commit": get_git_revision_hash("optimum_benchmark"), - "transformers_version": transformers_version(), + "transformers_version": transformers_version() if is_transformers_available() else None, "transformers_commit": get_git_revision_hash("transformers"), - "accelerate_version": accelerate_version(), + "accelerate_version": accelerate_version() if is_accelerate_available else None, "accelerate_commit": get_git_revision_hash("accelerate"), - "diffusers_version": diffusers_version(), + "diffusers_version": diffusers_version() if is_diffusers_available() else None, "diffusers_commit": get_git_revision_hash("diffusers"), - "optimum_version": optimum_version(), + "optimum_version": optimum_version() if is_optimum_available() else None, "optimum_commit": get_git_revision_hash("optimum"), - "timm_version": timm_version(), + "timm_version": timm_version() if is_timm_available() else None, "timm_commit": get_git_revision_hash("timm"), - "peft_version": peft_version(), + "peft_version": peft_version() if is_peft_available() else None, "peft_commit": get_git_revision_hash("peft"), } diff --git a/optimum_benchmark/trackers/latency.py b/optimum_benchmark/trackers/latency.py index 1f74a377..6b8d614f 100644 --- a/optimum_benchmark/trackers/latency.py +++ b/optimum_benchmark/trackers/latency.py @@ -264,7 +264,7 @@ def __init__(self, device: str, backend: str): self.start_time: Optional[float] = None self.prefilled: Optional[bool] = None - self.per_token_events: List[Union[float, torch.cuda.Event]] = [] + self.per_token_events: List[List[Union[float, torch.cuda.Event]]] = [] self.prefill_start_events: List[Union[float, torch.cuda.Event]] = [] self.prefill_end_events: List[Union[float, torch.cuda.Event]] = [] self.decode_start_events: List[Union[float, torch.cuda.Event]] = [] @@ -282,6 +282,9 @@ def reset(self): @contextmanager def track(self): + self.prefilled = False + self.per_token_events.append([]) + if self.is_distributed: torch.distributed.barrier() @@ -291,14 +294,10 @@ def track(self): else: self.prefill_start_events.append(time.perf_counter()) - self.prefilled = False - # this is where generate is called, # and for each decoded token, we record an event yield - self.prefilled = None - if self.is_asynchronous: self.decode_end_events.append(torch.cuda.Event(enable_timing=True)) self.decode_end_events[-1].record() @@ -308,6 +307,8 @@ def track(self): if self.is_distributed: torch.distributed.barrier() + self.prefilled = False + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor): assert ( self.prefilled is not None @@ -319,13 +320,13 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor): else: event = time.perf_counter() - self.per_token_events.append(event) - if not 
self.prefilled: self.prefill_end_events.append(event) self.decode_start_events.append(event) self.prefilled = True + self.per_token_events[-1].append(event) + return scores def get_prefill_latency(self) -> Latency: @@ -368,13 +369,15 @@ def get_per_token_latency(self) -> Latency: torch.cuda.synchronize() latencies_list = [ - self.per_token_events[i].elapsed_time(self.per_token_events[i + 1]) / 1e3 - for i in range(0, len(self.per_token_events) - 1) + self.per_token_events[i][j].elapsed_time(self.per_token_events[i][j + 1]) / 1e3 + for i in range(len(self.per_token_events)) + for j in range(0, len(self.per_token_events[i]) - 1) ] else: latencies_list = [ - (self.per_token_events[i + 1] - self.per_token_events[i]) - for i in range(0, len(self.per_token_events) - 1) + (self.per_token_events[i][j + 1] - self.per_token_events[i][j]) + for i in range(len(self.per_token_events)) + for j in range(0, len(self.per_token_events[i]) - 1) ] assert not any(latency < 0 for latency in latencies_list), "Negative latency detected" From e291e9b18cfb323457af60f15f2bb33803718668 Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Mon, 15 Jul 2024 16:51:54 +0200 Subject: [PATCH 4/6] Refactor backends and add `load` tracking (#227) --- examples/pytorch_llama.py | 63 ++++ ...orch_llama_awq.yaml => pytorch_llama.yaml} | 23 +- examples/pytorch_llama_awq.py | 28 -- optimum_benchmark/backends/base.py | 65 ++-- optimum_benchmark/backends/config.py | 12 +- optimum_benchmark/backends/diffusers_utils.py | 39 ++- .../backends/llm_swarm/backend.py | 21 +- .../backends/neural_compressor/backend.py | 71 ++-- .../backends/onnxruntime/backend.py | 142 ++++---- .../backends/onnxruntime/utils.py | 20 +- .../backends/openvino/backend.py | 162 +++++---- optimum_benchmark/backends/openvino/utils.py | 23 +- optimum_benchmark/backends/py_txi/backend.py | 28 +- optimum_benchmark/backends/pytorch/backend.py | 318 ++++++++++-------- .../backends/tensorrt_llm/backend.py | 33 +- optimum_benchmark/backends/timm_utils.py | 13 +- .../backends/torch_ort/backend.py | 41 +-- .../backends/transformers_utils.py | 78 ++++- optimum_benchmark/backends/vllm/backend.py | 43 ++- .../launchers/process/launcher.py | 9 +- .../scenarios/energy_star/scenario.py | 11 +- .../scenarios/inference/config.py | 16 +- .../scenarios/inference/scenario.py | 104 ++++-- .../scenarios/training/scenario.py | 27 +- optimum_benchmark/task_utils.py | 268 +++++++-------- optimum_benchmark/trackers/latency.py | 19 +- tests/configs/_bert_.yaml | 1 + tests/configs/_diffusers_.yaml | 2 +- tests/configs/cuda_inference_py_txi_bert.yaml | 2 +- tests/test_api.py | 2 +- 30 files changed, 940 insertions(+), 744 deletions(-) create mode 100644 examples/pytorch_llama.py rename examples/{pytorch_llama_awq.yaml => pytorch_llama.yaml} (51%) delete mode 100644 examples/pytorch_llama_awq.py diff --git a/examples/pytorch_llama.py b/examples/pytorch_llama.py new file mode 100644 index 00000000..5ecf5573 --- /dev/null +++ b/examples/pytorch_llama.py @@ -0,0 +1,63 @@ +import os + +from optimum_benchmark import Benchmark, BenchmarkConfig, InferenceConfig, ProcessConfig, PyTorchConfig +from optimum_benchmark.logging_utils import setup_logging + +BENCHMARK_NAME = "pytorch-llama" + +WEIGHTS_CONFIGS = { + "float16": { + "torch_dtype": "float16", + "quantization_scheme": None, + "quantization_config": {}, + }, + # "4bit-awq-gemm": { + # "torch_dtype": "float16", + # "quantization_scheme": "awq", + # "quantization_config": {"bits": 4, "version": 
"gemm"}, + # }, + # "4bit-gptq-exllama-v2": { + # "torch_dtype": "float16", + # "quantization_scheme": "gptq", + # "quantization_config": {"bits": 4, "use_exllama ": True, "version": 2, "model_seqlen": 256}, + # }, +} + + +def run_benchmark(weight_config: str): + launcher_config = ProcessConfig(device_isolation=True, device_isolation_action="warn") + backend_config = PyTorchConfig( + device="cuda", + device_ids="0", + no_weights=True, + model="gpt2", + **WEIGHTS_CONFIGS[weight_config], + ) + scenario_config = InferenceConfig( + memory=True, + latency=True, + duration=10, + iterations=10, + warmup_runs=10, + input_shapes={"batch_size": 1, "sequence_length": 128}, + generate_kwargs={"max_new_tokens": 32, "min_new_tokens": 32}, + ) + + benchmark_config = BenchmarkConfig( + name=BENCHMARK_NAME, launcher=launcher_config, scenario=scenario_config, backend=backend_config + ) + benchmark_report = Benchmark.launch(benchmark_config) + benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + + filename = f"{BENCHMARK_NAME}-{backend_config.version}-{weight_config}.json" + benchmark.push_to_hub(repo_id="optimum-benchmark/pytorch-llama", filename=filename) + benchmark.save_json(path=f"benchmarks/{filename}") + + +if __name__ == "__main__": + level = os.environ.get("LOG_LEVEL", "INFO") + to_file = os.environ.get("LOG_TO_FILE", "0") == "1" + setup_logging(level=level, to_file=to_file, prefix="MAIN-PROCESS") + + for weight_config in WEIGHTS_CONFIGS: + run_benchmark(weight_config) diff --git a/examples/pytorch_llama_awq.yaml b/examples/pytorch_llama.yaml similarity index 51% rename from examples/pytorch_llama_awq.yaml rename to examples/pytorch_llama.yaml index 34c8e957..becd1f2e 100644 --- a/examples/pytorch_llama_awq.yaml +++ b/examples/pytorch_llama.yaml @@ -3,24 +3,31 @@ defaults: - scenario: inference - launcher: process - backend: pytorch + - _base_ - _self_ -experiment_name: pytorch_llama_awq +name: pytorch_llama launcher: device_isolation: true device_isolation_action: warn backend: + model: gpt2 device: cuda - device_ids: 0 - no_weights: true - model: TheBloke/Llama-2-70B-AWQ + torch_dtype: float16 + +scenario: + memory: true + latency: true + + warmup_runs: 10 + iterations: 10 + duration: 10 -benchmark: input_shapes: batch_size: 1 - sequence_length: 128 + sequence_length: 256 generate_kwargs: - max_new_tokens: 100 - min_new_tokens: 100 + max_new_tokens: 32 + min_new_tokens: 32 diff --git a/examples/pytorch_llama_awq.py b/examples/pytorch_llama_awq.py deleted file mode 100644 index 96e100a9..00000000 --- a/examples/pytorch_llama_awq.py +++ /dev/null @@ -1,28 +0,0 @@ -from optimum_benchmark import Benchmark, BenchmarkConfig, InferenceConfig, ProcessConfig, PyTorchConfig -from optimum_benchmark.logging_utils import setup_logging - -setup_logging(level="INFO", prefix="MAIN-PROCESS") - -if __name__ == "__main__": - BENCHMARK_NAME = "pytorch_llama_awq" - REPO_ID = f"IlyasMoutawwakil/{BENCHMARK_NAME}" - - scenario_config = InferenceConfig( - memory=True, - latency=True, - input_shapes={"batch_size": 1, "sequence_length": 128}, - generate_kwargs={"max_new_tokens": 100, "min_new_tokens": 100}, - ) - launcher_config = ProcessConfig(device_isolation=True, device_isolation_action="warn") - backend_config = PyTorchConfig(device="cuda", device_ids="0", no_weights=True, model="TheBloke/Llama-2-70B-AWQ") - - benchmark_config = BenchmarkConfig( - name=BENCHMARK_NAME, launcher=launcher_config, scenario=scenario_config, backend=backend_config - ) - # benchmark_config.push_to_hub(repo_id=REPO_ID) - - 
benchmark_report = Benchmark.launch(benchmark_config) - # benchmark_report.push_to_hub(repo_id=REPO_ID) - - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - # benchmark.push_to_hub(repo_id=REPO_ID) diff --git a/optimum_benchmark/backends/base.py b/optimum_benchmark/backends/base.py index 8e6b9f89..8ae7e2cf 100644 --- a/optimum_benchmark/backends/base.py +++ b/optimum_benchmark/backends/base.py @@ -1,24 +1,34 @@ +import os from abc import ABC from collections import OrderedDict from logging import getLogger -from typing import Any, ClassVar, Dict, Generic, Optional, Tuple +from typing import Any, ClassVar, Dict, Generic, Optional import datasets.utils.logging as datasets_logging import transformers.utils.logging as transformers_logging +from safetensors.torch import save_file from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel, TrainerState, set_seed -from ..task_utils import get_automodel_class_for_task +from ..import_utils import is_torch_available from .config import BackendConfigT -from .diffusers_utils import extract_diffusers_shapes_from_model, get_diffusers_pretrained_config -from .timm_utils import extract_timm_shapes_from_config, get_timm_pretrained_config +from .diffusers_utils import ( + extract_diffusers_shapes_from_model, + get_diffusers_automodel_loader_for_task, + get_diffusers_pretrained_config, +) +from .timm_utils import extract_timm_shapes_from_config, get_timm_automodel_loader, get_timm_pretrained_config from .transformers_utils import ( PretrainedProcessor, extract_transformers_shapes_from_artifacts, + get_transformers_automodel_loader_for_task, get_transformers_generation_config, get_transformers_pretrained_config, get_transformers_pretrained_processor, ) +if is_torch_available(): + import torch + datasets_logging.set_verbosity_error() transformers_logging.set_verbosity_error() @@ -47,7 +57,7 @@ def __init__(self, config: BackendConfigT): self.logger.info("\t+ Benchmarking a Diffusers pipeline") self.pretrained_config = get_diffusers_pretrained_config(self.config.model, **self.config.model_kwargs) self.model_shapes = extract_diffusers_shapes_from_model(self.config.model, **self.config.model_kwargs) - self.model_type = self.config.task + self.automodel_loader = get_diffusers_automodel_loader_for_task(self.config.task) self.pretrained_processor = None self.generation_config = None @@ -55,7 +65,7 @@ def __init__(self, config: BackendConfigT): self.logger.info("\t+ Benchmarking a Timm model") self.pretrained_config = get_timm_pretrained_config(self.config.model) self.model_shapes = extract_timm_shapes_from_config(self.pretrained_config) - self.model_type = self.pretrained_config.architecture + self.automodel_loader = get_timm_automodel_loader() self.pretrained_processor = None self.generation_config = None @@ -69,31 +79,42 @@ def __init__(self, config: BackendConfigT): self.model_shapes = extract_transformers_shapes_from_artifacts( self.pretrained_config, self.pretrained_processor ) - self.model_type = self.pretrained_config.model_type - - self.automodel_class = get_automodel_class_for_task( - model_type=self.model_type, library=self.config.library, task=self.config.task, framework="pt" - ) - self.logger.info(f"\t+ Using automodel class {self.automodel_class.__name__}") + self.automodel_loader = get_transformers_automodel_loader_for_task(self.config.task) def seed(self) -> None: set_seed(self.config.seed) - def prepare_for_inference(self, **kwargs) -> None: + def create_no_weights_model(self) -> None: + if 
self.pretrained_config is None: + raise ValueError("Can't create no weights model without a pretrained config") + + self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") + self.logger.info("\t+ Creating no weights model's directory") + os.makedirs(self.no_weights_model, exist_ok=True) + self.logger.info("\t+ Creating no weights model's state dict") + state_dict = torch.nn.Linear(1, 1).state_dict() + self.logger.info("\t+ Saving no weights model's safetensors") + safetensors = os.path.join(self.no_weights_model, "model.safetensors") + save_file(tensors=state_dict, filename=safetensors, metadata={"format": "pt"}) + self.logger.info("\t+ Saving no weights model's config") + self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) + + def prepare_input_shapes(self, input_shapes: Dict[str, Any]) -> Dict[str, Any]: """ - This method is used to prepare the model for inference. - It can be used to compile the model with certain input/output shapes, for example. + This method is used to prepare and register the input shapes before using them by the model. + It can be used to pad the inputs to the correct shape, or compile it to the correct format. """ - pass + return input_shapes - def prepare_inputs( - self, inputs: Dict[str, Any], input_shapes: Dict[str, Any] - ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: """ - This method is used to prepare the inputs before passing them to the model. - It can be used to move the inputs to the correct device, for example. + This method is used to prepare and register the inputs before passing them to the model. + It can be used to move the inputs to the correct device, or rename their keys. """ - return inputs, input_shapes + return inputs + + def load(self) -> None: + raise NotImplementedError("Backend must implement load method") def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: """ diff --git a/optimum_benchmark/backends/config.py b/optimum_benchmark/backends/config.py index 9113d717..ba09267b 100644 --- a/optimum_benchmark/backends/config.py +++ b/optimum_benchmark/backends/config.py @@ -7,7 +7,11 @@ from psutil import cpu_count from ..system_utils import get_gpu_device_ids, is_nvidia_system, is_rocm_system -from ..task_utils import infer_library_from_model_name_or_path, infer_task_from_model_name_or_path +from ..task_utils import ( + infer_library_from_model_name_or_path, + infer_model_type_from_model_name_or_path, + infer_task_from_model_name_or_path, +) LOGGER = getLogger("backend") @@ -20,6 +24,7 @@ class BackendConfig(ABC): task: Optional[str] = None library: Optional[str] = None + model_type: Optional[str] = None model: Optional[str] = None processor: Optional[str] = None @@ -63,6 +68,11 @@ def __post_init__(self): if self.library is None: self.library = infer_library_from_model_name_or_path(self.model, self.hub_kwargs.get("revision", None)) + if self.model_type is None: + self.model_type = infer_model_type_from_model_name_or_path( + self.model, self.hub_kwargs.get("revision", None) + ) + if self.device is None: self.device = "cuda" if is_nvidia_system() or is_rocm_system() else "cpu" diff --git a/optimum_benchmark/backends/diffusers_utils.py b/optimum_benchmark/backends/diffusers_utils.py index a9b5b5a7..ef6f9376 100644 --- a/optimum_benchmark/backends/diffusers_utils.py +++ b/optimum_benchmark/backends/diffusers_utils.py @@ -5,11 +5,40 @@ from ..import_utils import is_diffusers_available if 
is_diffusers_available(): - import diffusers # type: ignore + import diffusers + from diffusers import DiffusionPipeline + + if hasattr(diffusers, "pipelines") and hasattr(diffusers.pipelines, "auto_pipeline"): + from diffusers.pipelines.auto_pipeline import ( + AUTO_IMAGE2IMAGE_PIPELINES_MAPPING, + AUTO_INPAINT_PIPELINES_MAPPING, + AUTO_TEXT2IMAGE_PIPELINES_MAPPING, + ) + + TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES = { + "inpainting": AUTO_INPAINT_PIPELINES_MAPPING.copy(), + "text-to-image": AUTO_TEXT2IMAGE_PIPELINES_MAPPING.copy(), + "image-to-image": AUTO_IMAGE2IMAGE_PIPELINES_MAPPING.copy(), + } + + for task_name, model_mapping in TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES.items(): + for model_type, model_class in model_mapping.items(): + TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES[task_name][model_type] = model_class.__name__ + else: + TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES = {} +else: + TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES = {} + + +TASKS_TO_MODEL_LOADERS = { + "inpainting": "AutoPipelineForInpainting", + "text-to-image": "AutoPipelineForText2Image", + "image-to-image": "AutoPipelineForImage2Image", +} def get_diffusers_pretrained_config(model: str, **kwargs) -> Dict[str, int]: - return diffusers.DiffusionPipeline.load_config(model, **kwargs) + return DiffusionPipeline.load_config(model, **kwargs) def extract_diffusers_shapes_from_model(model: str, **kwargs) -> Dict[str, int]: @@ -38,3 +67,9 @@ def extract_diffusers_shapes_from_model(model: str, **kwargs) -> Dict[str, int]: shapes["width"] = -1 return shapes + + +def get_diffusers_automodel_loader_for_task(task: str): + model_loader_name = TASKS_TO_MODEL_LOADERS[task] + model_loader_class = getattr(diffusers, model_loader_name) + return model_loader_class diff --git a/optimum_benchmark/backends/llm_swarm/backend.py b/optimum_benchmark/backends/llm_swarm/backend.py index dd08b9b7..8139e4ea 100644 --- a/optimum_benchmark/backends/llm_swarm/backend.py +++ b/optimum_benchmark/backends/llm_swarm/backend.py @@ -1,5 +1,5 @@ import asyncio -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, List import torch from huggingface_hub import AsyncInferenceClient @@ -16,8 +16,11 @@ class LLMSwarmBackend(Backend[LLMSwarmConfig]): def __init__(self, config: LLMSwarmConfig) -> None: super().__init__(config) - self.validate_task() + if self.config.task not in TEXT_GENERATION_TASKS: + raise NotImplementedError(f"LLM Swarm does not support task {self.config.task}") + + def load(self) -> None: self.logger.info("\t+ Downloading pretrained model") self.download_pretrained_model() self.logger.info("\t+ Preparing generation config") @@ -25,10 +28,6 @@ def __init__(self, config: LLMSwarmConfig) -> None: self.logger.info("\t+ Loading pretrained model") self.load_model_from_pretrained() - def validate_task(self) -> None: - if self.config.task not in TEXT_GENERATION_TASKS: - raise NotImplementedError(f"LLM Swarm does not support task {self.config.task}") - def load_model_from_pretrained(self) -> None: self.llm_swarm_config = LLMSwarmCfg( gpus=self.config.gpus, @@ -46,7 +45,7 @@ def load_model_from_pretrained(self) -> None: def download_pretrained_model(self) -> None: with torch.device("meta"): - self.automodel_class.from_pretrained(self.config.model, **self.config.model_kwargs) + self.auto_model_loader.from_pretrained(self.config.model, **self.config.model_kwargs) def prepare_generation_config(self) -> None: self.generation_config.eos_token_id = -100 @@ -60,11 +59,7 @@ def prepare_generation_config(self) -> None: self.logger.info("\t+ Saving new pretrained 
generation config") self.generation_config.save_pretrained(save_directory=model_snapshot_path) - def prepare_inputs( - self, inputs: Dict[str, Any], input_shapes: Dict[str, Any] - ) -> Tuple[Dict[str, Any], Dict[str, Any]]: - inputs, input_shapes = super().prepare_inputs(inputs, input_shapes) - + def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: if "inputs" in inputs: inputs = {"prompt": self.pretrained_processor.batch_decode(inputs["inputs"].tolist())} elif "input_ids" in inputs: @@ -72,7 +67,7 @@ def prepare_inputs( else: raise ValueError("inputs must contain either input_ids or inputs") - return inputs, input_shapes + return inputs async def single_client_call(self, prompt: str, kwargs: Dict[str, Any]) -> str: return await self.client.text_generation(prompt, max_new_tokens=kwargs.get("max_new_tokens", 1)) diff --git a/optimum_benchmark/backends/neural_compressor/backend.py b/optimum_benchmark/backends/neural_compressor/backend.py index 7ee43635..c180a5ba 100644 --- a/optimum_benchmark/backends/neural_compressor/backend.py +++ b/optimum_benchmark/backends/neural_compressor/backend.py @@ -10,7 +10,7 @@ from ...generators.dataset_generator import DatasetGenerator from ..base import Backend -from ..transformers_utils import random_init_weights +from ..transformers_utils import fast_weights_init from .config import INCConfig from .utils import TASKS_TO_INCMODELS @@ -20,89 +20,82 @@ class INCBackend(Backend[INCConfig]): def __init__(self, config: INCConfig): super().__init__(config) - self.validate_task() + if self.config.task in TASKS_TO_INCMODELS: + self.incmodel_class = get_class(TASKS_TO_INCMODELS[self.config.task]) + self.logger.info(f"Using INCModel class {self.incmodel_class.__name__}") + else: + raise NotImplementedError(f"INCBackend does not support task {self.config.task}") + + def load(self) -> None: self.logger.info("\t+ Creating backend temporary directory") self.tmpdir = TemporaryDirectory() if self.config.ptq_quantization: if self.config.no_weights: + self.logger.info("\t+ Creating no weights AutoModel") + self.create_no_weights_model() self.logger.info("\t+ Loading no weights AutoModel") self.load_automodel_with_no_weights() else: self.logger.info("\t+ Loading pretrained AutoModel") self.load_automodel_from_pretrained() - self.logger.info("\t+ Applying post-training quantization") self.quantize_automodel() - self.logger.info("\t+ Loading quantized INCModel") original_model, self.config.model = self.config.model, self.quantized_model self.load_incmodel_from_pretrained() self.config.model = original_model - elif self.config.no_weights: + self.logger.info("\t+ Creating no weights INCModel") + self.create_no_weights_model() self.logger.info("\t+ Loading no weights INCModel") self.load_incmodel_with_no_weights() - else: self.logger.info("\t+ Loading pretrained INCModel") self.load_incmodel_from_pretrained() self.tmpdir.cleanup() - def validate_task(self) -> None: - if self.config.task not in TASKS_TO_INCMODELS: - raise NotImplementedError(f"INCBackend does not support task {self.config.task}") - - self.incmodel_class = get_class(TASKS_TO_INCMODELS[self.config.task]) - self.logger.info(f"Using INCModel class {self.incmodel_class.__name__}") - def load_automodel_from_pretrained(self) -> None: - self.pretrained_model = self.automodel_class.from_pretrained(self.config.model, **self.config.model_kwargs) - - def create_no_weights_model(self) -> None: - self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") - self.logger.info("\t+ Creating no weights 
model directory") - os.makedirs(self.no_weights_model, exist_ok=True) - self.logger.info("\t+ Creating no weights model state dict") - state_dict = torch.nn.Linear(1, 1).state_dict() - self.logger.info("\t+ Saving no weights model pytorch_model.bin") - torch.save(state_dict, os.path.join(self.no_weights_model, "pytorch_model.bin")) - - if self.config.library == "transformers": - self.logger.info("\t+ Saving no weights model pretrained config") - self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) + self.pretrained_model = self.automodel_loader.from_pretrained(self.config.model, **self.config.model_kwargs) def load_automodel_with_no_weights(self) -> None: - self.logger.info("\t+ Creating no weights model") - self.create_no_weights_model() + original_model, self.config.model = self.config.model, self.no_weights_model - with random_init_weights(): - original_model, self.config.model = self.config.model, self.no_weights_model - self.logger.info("\t+ Loading no weights AutoModel") + with fast_weights_init(): self.load_automodel_from_pretrained() - self.config.model = original_model self.logger.info("\t+ Tying model weights") self.pretrained_model.tie_weights() + self.config.model = original_model + def load_incmodel_from_pretrained(self) -> None: self.pretrained_model = self.incmodel_class.from_pretrained(self.config.model, **self.config.model_kwargs) def load_incmodel_with_no_weights(self) -> None: - self.logger.info("\t+ Creating no weights model") - self.create_no_weights_model() + original_model, self.config.model = self.config.model, self.no_weights_model - with random_init_weights(): - original_model, self.config.model = self.config.model, self.no_weights_model - self.logger.info("\t+ Loading no weights INCModel") + with fast_weights_init(): self.load_incmodel_from_pretrained() - self.config.model = original_model self.logger.info("\t+ Tying model weights") self.pretrained_model.model.tie_weights() + self.config.model = original_model + + def create_no_weights_model(self) -> None: + self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") + self.logger.info("\t+ Creating no weights model directory") + os.makedirs(self.no_weights_model, exist_ok=True) + self.logger.info("\t+ Creating no weights model state dict") + state_dict = torch.nn.Linear(1, 1).state_dict() + self.logger.info("\t+ Saving no weights model pytorch_model.bin") + torch.save(state_dict, os.path.join(self.no_weights_model, "pytorch_model.bin")) + self.logger.info("\t+ Saving no weights model pretrained config") + self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) + def quantize_automodel(self) -> None: self.quantized_model = f"{self.tmpdir.name}/quantized_model" self.logger.info("\t+ Processing quantization config") diff --git a/optimum_benchmark/backends/onnxruntime/backend.py b/optimum_benchmark/backends/onnxruntime/backend.py index de8c52ac..8fb69254 100644 --- a/optimum_benchmark/backends/onnxruntime/backend.py +++ b/optimum_benchmark/backends/onnxruntime/backend.py @@ -1,12 +1,17 @@ import os from collections import OrderedDict from tempfile import TemporaryDirectory -from typing import Any, Dict, List, Tuple +from typing import Any, Dict import torch from hydra.utils import get_class from onnxruntime import SessionOptions -from optimum.onnxruntime import ONNX_DECODER_NAME, ONNX_DECODER_WITH_PAST_NAME, ORTOptimizer, ORTQuantizer +from optimum.onnxruntime import ( + ONNX_DECODER_NAME, + ONNX_DECODER_WITH_PAST_NAME, + ORTOptimizer, + ORTQuantizer, +) 
from optimum.onnxruntime.configuration import ( AutoCalibrationConfig, AutoOptimizationConfig, @@ -15,15 +20,19 @@ OptimizationConfig, QuantizationConfig, ) -from safetensors.torch import save_file from ...generators.dataset_generator import DatasetGenerator from ...import_utils import is_accelerate_available, is_torch_distributed_available from ...task_utils import TEXT_GENERATION_TASKS from ..base import Backend -from ..transformers_utils import random_init_weights +from ..transformers_utils import fast_weights_init from .config import ORTConfig -from .utils import TASKS_TO_ORTMODELS, TASKS_TO_ORTSD, format_calibration_config, format_quantization_config +from .utils import ( + TASKS_TO_MODEL_TYPES_TO_ORTPIPELINES, + TASKS_TO_ORTMODELS, + format_calibration_config, + format_quantization_config, +) if is_accelerate_available(): from accelerate import Accelerator @@ -37,7 +46,22 @@ class ORTBackend(Backend[ORTConfig]): def __init__(self, config: ORTConfig) -> None: super().__init__(config) - self.validate_task() + + if self.config.task in TASKS_TO_ORTMODELS: + self.ort_model_loader = get_class(TASKS_TO_ORTMODELS[self.config.task]) + self.logger.info(f"Using ORT Model class {self.ort_model_loader.__name__}") + elif self.config.task in TASKS_TO_MODEL_TYPES_TO_ORTPIPELINES: + if self.config.model_type in TASKS_TO_MODEL_TYPES_TO_ORTPIPELINES[self.config.task]: + self.ort_model_loader = get_class( + TASKS_TO_MODEL_TYPES_TO_ORTPIPELINES[self.config.task][self.config.model_type] + ) + self.logger.info(f"Using ORT Pipeline class {self.ort_model_loader.__name__}") + else: + raise NotImplementedError( + f"ORTBackend does not support model {self.config.model_type} for task {self.config.task}" + ) + else: + raise NotImplementedError(f"ORTBackend does not support task {self.config.task}") self.session_options = SessionOptions() if self.config.session_options: @@ -45,10 +69,19 @@ def __init__(self, config: ORTConfig) -> None: for key, value in self.config.session_options.items(): setattr(self.session_options, key, value) + def validate_execution_provider(self) -> None: + if not self.pretrained_model.providers[0] == self.config.provider: + raise ValueError( + f"{self.config.provider} is not first in providers list: {self.pretrained_model.providers}" + ) + + def load(self) -> None: self.logger.info("\t+ Creating backend temporary directory") self.tmpdir = TemporaryDirectory() if self.config.no_weights: + self.logger.info("\t+ Creating no weights ORTModel") + self.create_no_weights_model() self.logger.info("\t+ Loading no weights ORTModel") self.load_ortmodel_with_no_weights() else: @@ -70,55 +103,19 @@ def __init__(self, config: ORTConfig) -> None: if self.is_optimized or self.is_quantized: original_export, self.config.export = self.config.export, False - self.logger.info("\t+ Loading optimized/quantized ORTModel") + self.logger.info("\t+ Loading optimized/quantized model") self.load_ortmodel_from_pretrained() - self.config.model, self.config.export = original_model, original_export - - self.validate_provider() - self.tmpdir.cleanup() - - def validate_task(self) -> None: - if self.config.task in TASKS_TO_ORTSD: - self.ortmodel_class = get_class(TASKS_TO_ORTSD[self.config.task]) - self.logger.info(f"Using ORTStableDiffusion class {self.ortmodel_class.__name__}") - elif self.config.task in TASKS_TO_ORTMODELS: - self.ortmodel_class = get_class(TASKS_TO_ORTMODELS[self.config.task]) - self.logger.info(f"Using ORTModel class {self.ortmodel_class.__name__}") - else: - raise NotImplementedError(f"ORTBackend does 
not support task {self.config.task}") - - def validate_provider(self) -> None: - if not self.pretrained_model.providers[0] == self.config.provider: - raise ValueError( - f"{self.config.provider} is not first in providers list: {self.pretrained_model.providers}" - ) - - def create_no_weights_model(self) -> None: - self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") - self.logger.info("\t+ Creating no weights model directory") - os.makedirs(self.no_weights_model, exist_ok=True) - self.logger.info("\t+ Creating no weights model state dict") - state_dict = torch.nn.Linear(1, 1).state_dict() - self.logger.info("\t+ Saving no weights model safetensors") - safetensors = os.path.join(self.no_weights_model, "model.safetensors") - save_file(tensors=state_dict, filename=safetensors, metadata={"format": "pt"}) - - if self.config.library == "transformers": - self.logger.info("\t+ Saving no weights model pretrained config") - self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) + self.config.export = original_export + self.config.model = original_model - def load_ortmodel_with_no_weights(self) -> None: - self.logger.info("\t+ Creating no weights model") - self.create_no_weights_model() + self.logger.info("\t+ Validating requested Execution Provider") + self.validate_execution_provider() - with random_init_weights(): - original_model, self.config.model = self.config.model, self.no_weights_model - self.logger.info("\t+ Loading no weights ORTModel") - self.load_ortmodel_from_pretrained() - self.config.model = original_model + self.logger.info("\t+ Cleaning up backend temporary directory") + self.tmpdir.cleanup() def load_ortmodel_from_pretrained(self) -> None: - self.pretrained_model = self.ortmodel_class.from_pretrained( + self.pretrained_model = self.ort_model_loader.from_pretrained( self.config.model, export=self.config.export, session_options=self.session_options, @@ -129,6 +126,14 @@ def load_ortmodel_from_pretrained(self) -> None: **self.ortmodel_kwargs, ) + def load_ortmodel_with_no_weights(self) -> None: + original_model, self.config.model = self.config.model, self.no_weights_model + + with fast_weights_init(): + self.load_ortmodel_from_pretrained() + + self.config.model = original_model + @property def is_optimized(self) -> bool: return (self.config.auto_optimization is not None) or self.config.optimization @@ -167,15 +172,6 @@ def onnx_files_names(self): else: return [file for file in os.listdir(self.config.model) if file.endswith(".onnx")] - @property - def inputs_names(self) -> List[str]: - if hasattr(self.pretrained_model, "inputs_names"): - return self.pretrained_model.inputs_names - elif hasattr(self.pretrained_model, "input_names"): - return self.pretrained_model.input_names - else: - return [] - def optimize_onnx_files(self) -> None: self.logger.info("\t+ Attempting optimization") self.optimized_model = os.path.join(self.tmpdir.name, "optimized") @@ -231,7 +227,7 @@ def quantize_onnx_files(self) -> None: calibration_dataset = DatasetGenerator( task=self.config.task, dataset_shapes=dataset_shapes, model_shapes=self.model_shapes )() - columns_to_be_removed = list(set(calibration_dataset.column_names) - set(self.inputs_names)) + columns_to_be_removed = list(set(calibration_dataset.column_names) - set(self.pretrained_model.input_names)) calibration_dataset = calibration_dataset.remove_columns(columns_to_be_removed) self.logger.info("\t+ Processing calibration config") @@ -284,32 +280,34 @@ def quantize_onnx_files(self) -> None: if 
self.pretrained_config is not None: self.pretrained_config.save_pretrained(self.quantized_model) - def prepare_inputs( - self, inputs: Dict[str, Any], input_shapes: Dict[str, Any] - ) -> Tuple[Dict[str, Any], Dict[str, Any]]: - inputs, input_shapes = super().prepare_inputs(inputs, input_shapes) - + def prepare_input_shapes(self, input_shapes: Dict[str, Any]) -> Dict[str, Any]: if self.is_dp_distributed: if input_shapes["batch_size"] % torch.distributed.get_world_size() != 0: raise ValueError( - f"Batch size {input_shapes['batch_size']} must be divisible by data parallel " - f"world size {torch.distributed.get_world_size()}" + f"Batch size {input_shapes['batch_size']} must be divisible by " + f"data parallel world size {torch.distributed.get_world_size()}" ) - with Accelerator().split_between_processes(inputs=inputs, apply_padding=False) as split_inputs: - input_shapes["batch_size"] = input_shapes["batch_size"] // torch.distributed.get_world_size() - inputs = split_inputs + # distributing batch size across processes + input_shapes["batch_size"] //= torch.distributed.get_world_size() + + return input_shapes + + def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + if self.is_dp_distributed: + with Accelerator().split_between_processes(inputs=inputs, apply_padding=False) as process_inputs: + inputs = process_inputs if self.config.library == "transformers": for key, value in list(inputs.items()): if key in ["position_ids", "token_type_ids"]: - if key not in self.inputs_names: + if key not in self.pretrained_model.input_names: inputs.pop(key) for key, value in inputs.items(): if isinstance(value, torch.Tensor): inputs[key] = value.to(self.config.device) - return inputs, input_shapes + return inputs @torch.inference_mode() def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: diff --git a/optimum_benchmark/backends/onnxruntime/utils.py b/optimum_benchmark/backends/onnxruntime/utils.py index 86eeeed9..6177ae8e 100644 --- a/optimum_benchmark/backends/onnxruntime/utils.py +++ b/optimum_benchmark/backends/onnxruntime/utils.py @@ -3,15 +3,25 @@ from onnxruntime.quantization import CalibrationMethod, QuantFormat, QuantizationMode, QuantType from optimum.pipelines import ORT_SUPPORTED_TASKS -TASKS_TO_ORTSD = { - "stable-diffusion": "optimum.onnxruntime.ORTStableDiffusionPipeline", - "stable-diffusion-xl": "optimum.onnxruntime.ORTStableDiffusionXLPipeline", -} - TASKS_TO_ORTMODELS = { task: f"optimum.onnxruntime.{task_dict['class'][0].__name__}" for task, task_dict in ORT_SUPPORTED_TASKS.items() } +TASKS_TO_MODEL_TYPES_TO_ORTPIPELINES = { + "text-to-image": { + "stable-diffusion": "optimum.onnxruntime.ORTStableDiffusionPipeline", + "stable-diffusion-xl": "optimum.onnxruntime.ORTStableDiffusionXLPipeline", + "latent-consistency": "optimum.onnxruntime.ORTLatentConsistencyModelPipeline", + }, + "image-to-image": { + "stable-diffusion": "optimum.onnxruntime.ORTStableDiffusionImg2ImgPipeline", + "stable-diffusion-xl": "optimum.onnxruntime.ORTStableDiffusionImg2ImgXLPipeline", + }, + "inpainting": { + "stable-diffusion": "optimum.onnxruntime.ORTStableDiffusionInpaintingPipeline", + }, +} + def format_calibration_config(calibration_config: Dict[str, Any]) -> None: if calibration_config.get("method", None) is not None: diff --git a/optimum_benchmark/backends/openvino/backend.py b/optimum_benchmark/backends/openvino/backend.py index 91e8304f..cd2a57af 100644 --- a/optimum_benchmark/backends/openvino/backend.py +++ b/optimum_benchmark/backends/openvino/backend.py @@ 
-1,23 +1,21 @@ import inspect -import os from collections import OrderedDict from tempfile import TemporaryDirectory -from typing import Any, Dict, Tuple +from typing import Any, Dict import torch from hydra.utils import get_class from openvino.runtime import properties from optimum.intel.openvino import OVConfig as OVQuantizationConfig # naming conflict from optimum.intel.openvino import OVQuantizer -from safetensors.torch import save_file from ...generators.dataset_generator import DatasetGenerator from ...import_utils import is_accelerate_available, is_torch_distributed_available from ...task_utils import TEXT_GENERATION_TASKS from ..base import Backend -from ..transformers_utils import random_init_weights +from ..transformers_utils import fast_weights_init from .config import OVConfig -from .utils import TASKS_TO_OVMODEL +from .utils import TASKS_TO_MODEL_TYPES_TO_OVPIPELINE, TASKS_TO_OVMODEL if is_accelerate_available(): from accelerate import Accelerator @@ -31,91 +29,94 @@ class OVBackend(Backend[OVConfig]): def __init__(self, config: OVConfig) -> None: super().__init__(config) - self.validate_task() + + if self.config.task in TASKS_TO_OVMODEL: + self.ovmodel_class = get_class(TASKS_TO_OVMODEL[self.config.task]) + self.logger.info(f"\t+ Using OVModel class {self.ovmodel_class.__name__}") + elif self.config.task in TASKS_TO_MODEL_TYPES_TO_OVPIPELINE: + if self.config.model_type in TASKS_TO_MODEL_TYPES_TO_OVPIPELINE[self.config.task]: + self.ovmodel_class = get_class( + TASKS_TO_MODEL_TYPES_TO_OVPIPELINE[self.config.task][self.config.model_type] + ) + self.logger.info(f"\t+ Using OVPipeline class {self.ovmodel_class.__name__}") + else: + raise NotImplementedError( + f"OVBackend does not support model {self.config.model_type} for task {self.config.task}" + ) + else: + raise NotImplementedError(f"OVBackend does not support task {self.config.task}") if self.config.inter_op_num_threads is not None: self.logger.info(f"\t+ Setting inter_op_num_threads to {self.config.inter_op_num_threads}") self.config.openvino_config[properties.inference_num_threads()] = self.config.inter_op_num_threads + def load(self) -> None: self.logger.info("\t+ Creating backend temporary directory") self.tmpdir = TemporaryDirectory() if self.config.quantization: if self.config.no_weights: + self.logger.info("\t+ Creating no weights AutoModel") + self.create_no_weights_model() self.logger.info("\t+ Loading no weights AutoModel") - self.load_automodel_with_no_weights() + self._load_automodel_with_no_weights() else: self.logger.info("\t+ Loading pretrained AutoModel") - self.load_automodel_from_pretrained() - + self._load_automodel_from_pretrained() self.logger.info("\t+ Applying post-training quantization") self.quantize_automodel() - original_model, self.config.model = self.config.model, self.quantized_model original_export, self.config.export = self.config.export, False self.logger.info("\t+ Loading quantized OVModel") - self.load_ovmodel_from_pretrained() + self._load_ovmodel_from_pretrained() self.config.model, self.config.export = original_model, original_export - elif self.config.no_weights: + self.logger.info("\t+ Creating no weights OVModel") + self.create_no_weights_model() self.logger.info("\t+ Loading no weights OVModel") - self.load_ovmodel_with_no_weights() + self._load_ovmodel_with_no_weights() else: self.logger.info("\t+ Loading pretrained OVModel") - self.load_ovmodel_from_pretrained() + self._load_ovmodel_from_pretrained() - self.tmpdir.cleanup() + if self.config.reshape: + static_shapes = { + key: 
value + for key, value in {**self.input_shapes, **self.model_shapes}.items() + if key in inspect.getfullargspec(self.pretrained_model.reshape).args + } + if ("sequence_length" in static_shapes) and ("height" in static_shapes) and ("width" in static_shapes): + # for vision models, sequence_length is the number of channels + static_shapes["sequence_length"] = self.model_shapes.get("num_channels") - def validate_task(self) -> None: - if self.config.task not in TASKS_TO_OVMODEL: - raise NotImplementedError(f"OVBackend does not support task {self.config.task}") + self.logger.info(f"\t+ Reshaping model with static shapes: {static_shapes}") + self.pretrained_model.reshape(**static_shapes) + + if self.config.half: + self.logger.info("\t+ Converting model to half precision") + self.pretrained_model.half() - self.ovmodel_class = get_class(TASKS_TO_OVMODEL[self.config.task]) - self.logger.info(f"\t+ Using OVModel class {self.ovmodel_class.__name__}") + if self.config.reshape or self.config.half: + self.logger.info("\t+ Compiling model") + self.pretrained_model.compile() - def create_no_weights_model(self) -> None: - self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") - self.logger.info("\t+ Creating no weights model directory") - os.makedirs(self.no_weights_model, exist_ok=True) - self.logger.info("\t+ Creating no weights model state dict") - state_dict = torch.nn.Linear(1, 1).state_dict() - self.logger.info("\t+ Saving no weights model safetensors") - safetensors = os.path.join(self.no_weights_model, "model.safetensors") - save_file(tensors=state_dict, filename=safetensors, metadata={"format": "pt"}) + self.tmpdir.cleanup() - if self.config.library == "transformers": - self.logger.info("\t+ Saving no weights model pretrained config") - self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) + def _load_automodel_from_pretrained(self) -> None: + self.pretrained_model = self.automodel_loader.from_pretrained(self.config.model, **self.config.model_kwargs) - def load_automodel_with_no_weights(self) -> None: - self.logger.info("\t+ Creating no weights model") - self.create_no_weights_model() + def _load_automodel_with_no_weights(self) -> None: + original_model, self.config.model = self.config.model, self.no_weights_model - with random_init_weights(): - original_model, self.config.model = self.config.model, self.no_weights_model - self.logger.info("\t+ Loading no weights AutoModel") - self.load_automodel_from_pretrained() - self.config.model = original_model + with fast_weights_init(): + self._load_automodel_from_pretrained() self.logger.info("\t+ Tying model weights") self.pretrained_model.tie_weights() - def load_automodel_from_pretrained(self) -> None: - self.pretrained_model = self.automodel_class.from_pretrained(self.config.model, **self.config.model_kwargs) - - def load_ovmodel_with_no_weights(self) -> None: - self.logger.info("\t+ Creating no weights model") - self.create_no_weights_model() + self.config.model = original_model - with random_init_weights(): - original_model, self.config.model = self.config.model, self.no_weights_model - original_export, self.config.export = self.config.export, True - self.logger.info("\t+ Loading no weights OVModel") - self.load_ovmodel_from_pretrained() - self.config.model = original_model - self.config.export = original_export - - def load_ovmodel_from_pretrained(self) -> None: + def _load_ovmodel_from_pretrained(self) -> None: self.pretrained_model = self.ovmodel_class.from_pretrained( self.config.model, 
export=self.config.export, @@ -125,6 +126,15 @@ def load_ovmodel_from_pretrained(self) -> None: **self.ovmodel_kwargs, ) + def _load_ovmodel_with_no_weights(self) -> None: + with fast_weights_init(): + original_model, self.config.model = self.config.model, self.no_weights_model + original_export, self.config.export = self.config.export, True + self.logger.info("\t+ Loading no weights OVModel") + self._load_ovmodel_from_pretrained() + self.config.export = original_export + self.config.model = original_model + @property def is_dp_distributed(self) -> bool: return is_torch_distributed_available() and torch.distributed.is_initialized() @@ -171,43 +181,27 @@ def quantize_automodel(self) -> None: batch_size=1, ) - def prepare_inputs( - self, inputs: Dict[str, Any], input_shapes: Dict[str, Any] - ) -> Tuple[Dict[str, Any], Dict[str, Any]]: - inputs, input_shapes = super().prepare_inputs(inputs, input_shapes) - + def prepare_input_shapes(self, input_shapes: Dict[str, Any]) -> Dict[str, Any]: if self.is_dp_distributed: if input_shapes["batch_size"] % torch.distributed.get_world_size() != 0: raise ValueError( - f"Batch size {input_shapes['batch_size']} must be divisible by data parallel " - f"world size {torch.distributed.get_world_size()}" + f"Batch size {input_shapes['batch_size']} must be divisible by " + f"data parallel world size {torch.distributed.get_world_size()}" ) - with Accelerator().split_between_processes(inputs=inputs, apply_padding=False) as split_inputs: - input_shapes["batch_size"] = input_shapes["batch_size"] // torch.distributed.get_world_size() - inputs = split_inputs - - return inputs, input_shapes + # distributing batch size across processes + input_shapes["batch_size"] //= torch.distributed.get_world_size() - def prepare_for_inference(self, **kwargs) -> None: - if self.config.reshape: - static_shapes = { - key: value - for key, value in kwargs.items() - if key in inspect.getfullargspec(self.pretrained_model.reshape).args - } - if (static_shapes.get("height", None) is not None) and ("sequence_length" in static_shapes): - static_shapes["sequence_length"] = kwargs.get("num_channels", 3) + # registering input shapes for usage during model reshaping + self.input_shapes = input_shapes - self.logger.info(f"\t+ Reshaping model with static shapes: {static_shapes}") - self.pretrained_model.reshape(**static_shapes) + return input_shapes - if self.config.half: - self.logger.info("\t+ Converting model to half precision") - self.pretrained_model.half() + def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + if self.is_dp_distributed: + with Accelerator().split_between_processes(inputs=inputs, apply_padding=False) as process_inputs: + inputs = process_inputs - if self.config.reshape or self.config.half: - self.logger.info("\t+ Compiling model") - self.pretrained_model.compile() + return inputs def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: return self.pretrained_model.forward(**inputs, **kwargs) diff --git a/optimum_benchmark/backends/openvino/utils.py b/optimum_benchmark/backends/openvino/utils.py index b1005f38..35518346 100644 --- a/optimum_benchmark/backends/openvino/utils.py +++ b/optimum_benchmark/backends/openvino/utils.py @@ -1,4 +1,19 @@ -from optimum.intel.openvino.utils import _HEAD_TO_AUTOMODELS - -TASKS_TO_OVMODEL = {task: f"optimum.intel.openvino.{ovmodel}" for task, ovmodel in _HEAD_TO_AUTOMODELS.items()} -TASKS_TO_OVMODEL.update({"feature-extraction": "optimum.intel.openvino.OVModelForFeatureExtraction"}) +TASKS_TO_OVMODEL = 
{ + "fill-mask": "optimum.intel.openvino.OVModelForMaskedLM", + "text-generation": "optimum.intel.openvino.OVModelForCausalLM", + "text2text-generation": "optimum.intel.openvino.OVModelForSeq2SeqLM", + "feature-extraction": "optimum.intel.openvino.OVModelForFeatureExtraction", + "text-classification": "optimum.intel.openvino.OVModelForSequenceClassification", + "token-classification": "optimum.intel.openvino.OVModelForTokenClassification", + "question-answering": "optimum.intel.openvino.OVModelForQuestionAnswering", + "image-classification": "optimum.intel.openvino.OVModelForImageClassification", + "audio-classification": "optimum.intel.openvino.OVModelForAudioClassification", + "pix2struct": "optimum.intel.openvino.OVModelForPix2Struct", +} +TASKS_TO_MODEL_TYPES_TO_OVPIPELINE = { + "text-to-image": { + "lcm": "optimum.intel.openvino.OVLatentConsistencyModelPipeline", + "stable-diffusion": "optimum.intel.openvino.OVStableDiffusionPipeline", + "stable-diffusion-xl": "optimum.intel.openvino.OVStableDiffusionXLPipeline", + }, +} diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py index a93d7b5d..6e637a31 100644 --- a/optimum_benchmark/backends/py_txi/backend.py +++ b/optimum_benchmark/backends/py_txi/backend.py @@ -1,14 +1,15 @@ import os from tempfile import TemporaryDirectory -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, List import torch +from accelerate import init_empty_weights from py_txi import TEI, TGI, TEIConfig, TGIConfig from safetensors.torch import save_file from ...task_utils import TEXT_EMBEDDING_TASKS, TEXT_GENERATION_TASKS from ..base import Backend -from ..transformers_utils import random_init_weights +from ..transformers_utils import fast_weights_init from .config import PyTXIConfig @@ -18,10 +19,13 @@ class PyTXIBackend(Backend[PyTXIConfig]): def __init__(self, config: PyTXIConfig) -> None: super().__init__(config) + def load(self) -> None: self.logger.info("\t+ Creating backend temporary directory") self.tmpdir = TemporaryDirectory() if self.config.no_weights: + self.logger.info("\t+ Creating no weights model") + self.create_no_weights_model() self.logger.info("\t+ Loading no weights model") self.load_model_with_no_weights() else: @@ -43,8 +47,8 @@ def volume(self) -> str: def download_pretrained_model(self) -> None: # directly downloads pretrained model in volume (/data) to change generation config before loading model - with torch.device("meta"): - self.automodel_class.from_pretrained(self.config.model, **self.config.model_kwargs, cache_dir=self.volume) + with init_empty_weights(include_buffers=True): + self.automodel_loader.from_pretrained(self.config.model, **self.config.model_kwargs, cache_dir=self.volume) def prepare_generation_config(self) -> None: self.generation_config.eos_token_id = None @@ -73,8 +77,8 @@ def create_no_weights_model(self) -> None: self.pretrained_processor.save_pretrained(save_directory=self.no_weights_model) # unlike Transformers, TXI won't accept any missing tensors so we need to materialize the model self.logger.info(f"\t+ Loading no weights model from {self.no_weights_model}") - with random_init_weights(): - self.pretrained_model = self.automodel_class.from_pretrained( + with fast_weights_init(): + self.pretrained_model = self.automodel_loader.from_pretrained( self.no_weights_model, **self.config.model_kwargs, device_map="auto", _fast_init=False ) self.logger.info("\t+ Saving no weights model") @@ -86,14 +90,10 @@ def create_no_weights_model(self) -> None: 
self.logger.info("\t+ Modifying generation config for fixed length generation") self.generation_config.eos_token_id = None self.generation_config.pad_token_id = None - self.logger.info("\t+ Saving new pretrained generation config") self.generation_config.save_pretrained(save_directory=self.no_weights_model) def load_model_with_no_weights(self) -> None: - self.logger.info("\t+ Creating no weights model") - self.create_no_weights_model() - original_volumes, self.config.volumes = self.config.volumes, {self.tmpdir.name: {"bind": "/data", "mode": "rw"}} original_model, self.config.model = self.config.model, "/data/no_weights_model" self.logger.info("\t+ Loading no weights model") @@ -139,11 +139,7 @@ def load_model_from_pretrained(self) -> None: else: raise NotImplementedError(f"TXI does not support task {self.config.task}") - def prepare_inputs( - self, inputs: Dict[str, Any], input_shapes: Dict[str, Any] - ) -> Tuple[Dict[str, Any], Dict[str, Any]]: - inputs, input_shapes = super().prepare_inputs(inputs, input_shapes) - + def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: if self.config.task in TEXT_GENERATION_TASKS: inputs = {"prompt": self.pretrained_processor.batch_decode(inputs["input_ids"].tolist())} elif self.config.task in TEXT_EMBEDDING_TASKS: @@ -151,7 +147,7 @@ def prepare_inputs( else: raise NotImplementedError(f"TXI does not support task {self.config.task}") - return inputs, input_shapes + return inputs def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> List[str]: return self.pretrained_model.encode(**inputs, **kwargs) diff --git a/optimum_benchmark/backends/pytorch/backend.py b/optimum_benchmark/backends/pytorch/backend.py index efcbaff4..33914164 100644 --- a/optimum_benchmark/backends/pytorch/backend.py +++ b/optimum_benchmark/backends/pytorch/backend.py @@ -1,10 +1,10 @@ import os from collections import OrderedDict from tempfile import TemporaryDirectory -from typing import Any, Callable, Dict, List, Tuple +from typing import Any, Callable, Dict, List import torch -from accelerate import Accelerator +from accelerate import Accelerator, init_empty_weights, init_on_device from datasets import Dataset from safetensors.torch import save_file from transformers import ( @@ -20,7 +20,7 @@ from ...import_utils import is_deepspeed_available, is_torch_distributed_available, is_zentorch_available from ..base import Backend from ..peft_utils import apply_peft -from ..transformers_utils import random_init_weights +from ..transformers_utils import fast_weights_init from .config import PyTorchConfig if is_deepspeed_available(): @@ -38,20 +38,12 @@ class PyTorchBackend(Backend[PyTorchConfig]): def __init__(self, config: PyTorchConfig): super().__init__(config) - self.validate_library() - - if self.config.deepspeed_inference and self.is_quantized: - raise ValueError("Deepspeed-Inference is not compatible with Transformers quantization") - - # Quantization - if self.is_quantized: - self.logger.info("\t+ Processing quantization config") - self.process_quantization_config() # Threads if self.config.inter_op_num_threads is not None: self.logger.info(f"\t+ Setting pytorch inter_op_num_threads({self.config.inter_op_num_threads}))") torch.set_num_threads(self.config.inter_op_num_threads) + if self.config.intra_op_num_threads is not None: self.logger.info(f"\t+ Setting pytorch intra_op_num_threads({self.config.intra_op_num_threads}))") torch.set_num_interop_threads(self.config.intra_op_num_threads) @@ -71,36 +63,111 @@ def __init__(self, config: PyTorchConfig): 
else: raise ValueError(f"Device {self.config.device} not supported for autocast") + def load(self) -> None: self.logger.info("\t+ Creating backend temporary directory") self.tmpdir = TemporaryDirectory() - # Model - if self.config.no_weights and (self.config.library == "diffusers" or self.config.library == "timm"): - raise ValueError("Diffusion pipelines and Timm models don't support no weights") - elif self.config.no_weights: - self.logger.info("\t+ Loading model with random weights") - self.load_model_with_no_weights() + if self.config.library == "transformers": + self.load_transformers_model() + elif self.config.library == "diffusers": + self.load_diffusers_model() + elif self.config.library == "timm": + self.load_timm_model() else: - self.logger.info("\t+ Loading model with pretrained weights") - self.load_model_from_pretrained() + raise ValueError(f"Library {self.config.library} not supported for PyTorch backend") + self.logger.info("\t+ Cleaning up backend temporary directory") self.tmpdir.cleanup() + def load_transformers_model_from_pretrained(self) -> None: + if self.is_quantized: + self.logger.info(f"\t+ Loading {self.quantization_config.quant_method}-quantized model") + self.pretrained_model = self.automodel_loader.from_pretrained( + pretrained_model_name_or_path=self.config.model, + device_map=self.config.device_map or torch.device(self.config.device), + # quantized models are more compatible with device_map dispatcher than (to(device)) + # using to(device) on quantized models sometimes leaves some layers on cpu or raises + # an error because the layers are already on the device + **self.config.model_kwargs, + **self.automodel_kwargs, + ) + elif self.config.device_map is not None: + self.logger.info(f"\t+ Loading Transformers model with device map: {self.config.device_map}") + self.pretrained_model = self.automodel_loader.from_pretrained( + pretrained_model_name_or_path=self.config.model, + device_map=self.config.device_map, + **self.config.model_kwargs, + **self.automodel_kwargs, + ) + else: + self.logger.info("\t+ Loading Transformers model") + self.pretrained_model = self.automodel_loader.from_pretrained( + pretrained_model_name_or_path=self.config.model, **self.config.model_kwargs, **self.automodel_kwargs + ) + if self.config.device != "cpu": + self.logger.info(f"\t+ Moving Transformers model to device: {self.config.device}") + self.pretrained_model = self.pretrained_model.to(self.config.device) + + def load_transformers_model_with_no_weights(self) -> None: + original_model, self.config.model = self.config.model, self.no_weights_model + + if self.config.deepspeed_inference: + with init_empty_weights(include_buffers=False): + self.logger.info("\t+ Loading Transformers model on meta device for fast initialization") + self.pretrained_model = self.automodel_loader.from_pretrained( + pretrained_model_name_or_path=self.config.model, + **self.config.model_kwargs, + **self.automodel_kwargs, + ) + self.pretrained_model.to_empty(device="cpu") + elif self.config.device_map is None and not self.is_quantized: + with init_on_device(device=torch.device(self.config.device), include_buffers=True): + self.logger.info("\t+ Loading Transformers model using device context manager for fast initialization") + self.pretrained_model = self.automodel_loader.from_pretrained( + pretrained_model_name_or_path=self.no_weights_model, + **self.config.model_kwargs, + **self.automodel_kwargs, + ) + else: + with fast_weights_init(): + self.load_transformers_model_from_pretrained() + + self.config.model = 
original_model + + def load_transformers_model(self): + if self.config.deepspeed_inference and self.is_quantized: + raise ValueError("Deepspeed-Inference is not compatible with Transformers quantization") + + # Quantization + if self.is_quantized: + self.logger.info("\t+ Processing quantization config") + self.process_quantization_config() + + # Model loading + if self.config.no_weights: + self.logger.info("\t+ Creating no weights model") + self.create_no_weights_model() + self.logger.info("\t+ Loading model with random weights") + self.load_transformers_model_with_no_weights() + else: + self.logger.info("\t+ Loading model with pretrained weights") + self.load_transformers_model_from_pretrained() + # KV-Cache if self.config.cache_implementation is not None: self.logger.info(f"\t+ Setting cache implementation to {self.config.cache_implementation}") self.pretrained_model.generation_config.cache_implementation = self.config.cache_implementation - # Eval mode - if self.config.eval_mode and self.config.library != "diffusers": - self.logger.info("\t+ Turning on model's eval mode") - self.pretrained_model.eval() - # BetterTransformer if self.config.to_bettertransformer: - self.logger.info("\t+ Enabling BetterTransformer") + self.logger.info("\t+ To BetterTransformer") self.pretrained_model.to_bettertransformer() + # Eval mode + if self.config.eval_mode: + self.logger.info("\t+ Enabling eval mode") + self.pretrained_model.eval() + # PEFT if self.config.peft_type is not None: self.logger.info("\t+ Applying PEFT") @@ -115,87 +182,76 @@ def __init__(self, config: PyTorchConfig): # Torch compile if self.config.torch_compile: - if self.config.library == "diffusers": - self.logger.info("\t+ Using torch.compile on unet and vae") - self.pretrained_model.unet = torch.compile( - self.pretrained_model.unet, **self.config.torch_compile_config - ) - self.pretrained_model.vae.decode = torch.compile( - self.pretrained_model.vae.decode, **self.config.torch_compile_config + if self.config.torch_compile_target == "forward": + self.logger.info("\t+ Using torch.compile on forward") + self.pretrained_model.forward = torch.compile( + self.pretrained_model.forward, **self.config.torch_compile_config ) + elif self.config.torch_compile_target == "model": + self.logger.info("\t+ Using torch.compile on model") + self.pretrained_model = torch.compile(self.pretrained_model, **self.config.torch_compile_config) else: - if self.config.torch_compile_target == "forward": - self.logger.info("\t+ Using torch.compile on forward") - self.pretrained_model.forward = torch.compile( - self.pretrained_model.forward, **self.config.torch_compile_config - ) - elif self.config.torch_compile_target == "model": - self.logger.info("\t+ Using torch.compile on model") - self.pretrained_model = torch.compile(self.pretrained_model, **self.config.torch_compile_config) - else: - raise ValueError(f"Target {self.config.torch_compile_target} not supported") + raise ValueError(f"Target {self.config.torch_compile_target} not supported") + + def load_diffusers_pipeline_from_pretrained(self) -> None: + self.pretrained_model = self.automodel_loader.from_pretrained( + self.config.model, + # pretrained_model_name_or_path=self.config.model, + # pretrained_model_or_path=self.config.model, + device_map=self.config.device_map, + **self.config.model_kwargs, + **self.automodel_kwargs, + ) + if self.config.device_map is None and self.config.device != "cpu": + self.logger.info(f"\t+ Moving Diffusion Pipeline to device: {self.config.device}") + self.pretrained_model = 
self.pretrained_model.to(self.config.device) - def validate_library(self) -> None: - if self.config.library == "timm": - self.logger.info(f"\t+ Using Timm's {self.automodel_class.__name__}") - elif self.config.library == "diffusers": - self.logger.info(f"\t+ Using Diffusers Pipeline {self.automodel_class.__name__}") - elif self.config.library == "transformers": - self.logger.info(f"\t+ Using AutoModel {self.automodel_class.__name__}") - else: - raise ValueError(f"Library {self.config.library} not supported") + def load_diffusers_model(self): + self.logger.info("\t+ Loading Diffusion Pipeline") + self.logger.info(f"\t+ Using Diffusers Pipeline {self.automodel_loader.__name__}") - def load_model_from_pretrained(self) -> None: - if self.config.library == "timm": - self.logger.info("\t+ Loading Timm model") - self.pretrained_model = self.automodel_class(model_name=self.config.model) - if self.config.device != "cpu": - self.logger.info(f"\t+ Moving Timm model to device: {self.config.device}") - self.pretrained_model = self.pretrained_model.to(self.config.device) + # Model loading + if self.config.no_weights: + raise ValueError("No weights model not supported for Diffusers") + else: + self.load_diffusers_pipeline_from_pretrained() - elif self.config.library == "diffusers": - self.logger.info("\t+ Loading Diffusion Pipeline") - self.pretrained_model = self.automodel_class.from_pretrained( - # pretrained_model_name_or_path=self.config.model, - # pretrained_model_or_path=self.config.model, - self.config.model, - device_map=self.config.device_map, - **self.config.model_kwargs, - **self.automodel_kwargs, + # Torch compile + if self.config.torch_compile: + self.logger.info("\t+ Using torch.compile on unet and vae") + self.pretrained_model.unet = torch.compile(self.pretrained_model.unet, **self.config.torch_compile_config) + self.pretrained_model.vae.decode = torch.compile( + self.pretrained_model.vae.decode, **self.config.torch_compile_config ) - if self.config.device_map is None and self.config.device != "cpu": - self.logger.info(f"\t+ Moving Diffusion Pipeline to device: {self.config.device}") - self.pretrained_model = self.pretrained_model.to(self.config.device) - elif self.is_quantized: - self.logger.info(f"\t+ Loading {self.quantization_config.quant_method}-quantized model") - self.pretrained_model = self.automodel_class.from_pretrained( - pretrained_model_name_or_path=self.config.model, - device_map=self.config.device_map or torch.device(self.config.device), - # quantized models are more compatible with device_map dispatcher than (to(device)) - # using to(device) on quantized models sometimes leaves some layers on cpu or raises - # an error because the layers are already on the device - **self.config.model_kwargs, - **self.automodel_kwargs, - ) + def load_timm_model_form_pretrained(self) -> None: + self.pretrained_model = self.automodel_loader(model_name=self.config.model) + if self.config.device != "cpu": + self.logger.info(f"\t+ Moving Timm model to device: {self.config.device}") + self.pretrained_model = self.pretrained_model.to(self.config.device) - elif self.config.device_map is not None: - self.logger.info(f"\t+ Loading Transformers model with device map: {self.config.device_map}") - self.pretrained_model = self.automodel_class.from_pretrained( - pretrained_model_name_or_path=self.config.model, - device_map=self.config.device_map, - **self.config.model_kwargs, - **self.automodel_kwargs, - ) + def load_timm_model(self): + self.logger.info("\t+ Loading Timm model") + self.logger.info(f"\t+ 
Using Timm's {self.automodel_loader.__name__}") + # Model loading + if self.config.no_weights: + raise ValueError("No weights model not supported for Timm") else: - self.logger.info("\t+ Loading Transformers model") - self.pretrained_model = self.automodel_class.from_pretrained( - pretrained_model_name_or_path=self.config.model, **self.config.model_kwargs, **self.automodel_kwargs - ) - if self.config.device != "cpu": - self.logger.info(f"\t+ Moving Transformers model to device: {self.config.device}") - self.pretrained_model = self.pretrained_model.to(self.config.device) + self.load_timm_model_form_pretrained() + + # Torch compile + if self.config.torch_compile: + if self.config.torch_compile_target == "forward": + self.logger.info("\t+ Using torch.compile on forward") + self.pretrained_model.forward = torch.compile( + self.pretrained_model.forward, **self.config.torch_compile_config + ) + elif self.config.torch_compile_target == "model": + self.logger.info("\t+ Using torch.compile on model") + self.pretrained_model = torch.compile(self.pretrained_model, **self.config.torch_compile_config) + else: + raise ValueError(f"Target {self.config.torch_compile_target} not supported") def create_no_weights_model(self) -> None: if self.pretrained_config is None: @@ -209,8 +265,8 @@ def create_no_weights_model(self) -> None: if self.is_exllamav2: self.logger.info("\t+ Adding g_idx to no weights model state dict") - with torch.device("meta"): - meta_model = self.automodel_class.from_config(self.pretrained_config) + with init_empty_weights(include_buffers=False): + meta_model = self.automodel_loader.from_config(self.pretrained_config) for name, module in meta_model.named_modules(): if hasattr(module, "in_features"): state_dict[name + ".g_idx"] = torch.ones((module.in_features,), dtype=torch.int32) @@ -227,38 +283,6 @@ def create_no_weights_model(self) -> None: self.logger.info("\t+ Saving no weights model pretrained config") self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) - def load_model_with_no_weights(self) -> None: - self.create_no_weights_model() - original_model, self.config.model = self.config.model, self.no_weights_model - - if self.config.deepspeed_inference: - with torch.device("meta"): - # with big models, loading no_weights_model is very slow (randomizing every weight) - # so we load the model on meta device to speed up the process and then move it to cpu - self.logger.info("\t+ Loading Transformers model on meta device for fast initialization") - self.pretrained_model = self.automodel_class.from_pretrained( - pretrained_model_name_or_path=self.config.model, - **self.config.model_kwargs, - **self.automodel_kwargs, - ) - self.logger.info("\t+ Materializing meta model on CPU to avoid OOM") - self.pretrained_model.to_empty(device="cpu") - - elif not self.is_quantized and self.config.device_map is None: - with torch.device(self.config.device): - self.logger.info("\t+ Loading Transformers model using device context manager for fast initialization") - self.pretrained_model = self.automodel_class.from_pretrained( - pretrained_model_name_or_path=self.no_weights_model, - **self.config.model_kwargs, - **self.automodel_kwargs, - ) - - else: - with random_init_weights(): - self.load_model_from_pretrained() - - self.config.model = original_model - def process_quantization_config(self) -> None: if self.is_gptq_quantized: self.logger.info("\t+ Processing GPTQ config") @@ -353,31 +377,34 @@ def automodel_kwargs(self) -> Dict[str, Any]: kwargs["low_cpu_mem_usage"] = 
self.config.low_cpu_mem_usage if self.config.no_weights: - # we use our own context manager to load the model with random weights + # we use our own context manager to load the + # model with faster random weights generators kwargs["_fast_init"] = False return kwargs - def prepare_inputs( - self, inputs: Dict[str, Any], input_shapes: Dict[str, Any] - ) -> Tuple[Dict[str, Any], Dict[str, Any]]: - inputs, input_shapes = super().prepare_inputs(inputs, input_shapes) - + def prepare_input_shapes(self, input_shapes: Dict[str, Any]) -> Dict[str, Any]: if self.is_dp_distributed: if input_shapes["batch_size"] % torch.distributed.get_world_size() != 0: raise ValueError( - f"Batch size {input_shapes['batch_size']} must be divisible by data parallel " - f"world size {torch.distributed.get_world_size()}" + f"Batch size {input_shapes['batch_size']} must be divisible by " + f"data parallel world size {torch.distributed.get_world_size()}" ) - with Accelerator().split_between_processes(inputs=inputs, apply_padding=False) as split_inputs: - input_shapes["batch_size"] = input_shapes["batch_size"] // torch.distributed.get_world_size() - inputs = split_inputs + # distributing batch size across processes + input_shapes["batch_size"] //= torch.distributed.get_world_size() if self.is_tp_distributed: if torch.distributed.get_rank() != 0: - # this is to force throughput of non main shards to 0 + # zeroing throughput on other ranks input_shapes["batch_size"] = 0 + return input_shapes + + def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + if self.is_dp_distributed: + with Accelerator().split_between_processes(inputs=inputs, apply_padding=False) as process_inputs: + inputs = process_inputs + if self.config.library == "timm": inputs = {"x": inputs["pixel_values"]} @@ -385,7 +412,7 @@ def prepare_inputs( if isinstance(value, torch.Tensor): inputs[key] = value.to(self.config.device) - return inputs, input_shapes + return inputs @torch.inference_mode() def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: @@ -393,6 +420,9 @@ def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict @torch.inference_mode() def prefill(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: + assert ( + kwargs.get("max_new_tokens") == kwargs.get("min_new_tokens") == 1 + ), "For prefilling, max_new_tokens and min_new_tokens must be equal to 1" return self.pretrained_model.generate(**inputs, **kwargs) @torch.inference_mode() diff --git a/optimum_benchmark/backends/tensorrt_llm/backend.py b/optimum_benchmark/backends/tensorrt_llm/backend.py index a4bc1c39..a05187c3 100644 --- a/optimum_benchmark/backends/tensorrt_llm/backend.py +++ b/optimum_benchmark/backends/tensorrt_llm/backend.py @@ -1,11 +1,8 @@ -import os from collections import OrderedDict from tempfile import TemporaryDirectory from typing import Any, Dict -import torch from hydra.utils import get_class -from safetensors.torch import save_file from ..base import Backend from .config import TRTLLMConfig @@ -17,37 +14,25 @@ class TRTLLMBackend(Backend[TRTLLMConfig]): def __init__(self, config: TRTLLMConfig): super().__init__(config) - self.validate_model_type() + if self.config.model_type in MODEL_TYPE_TO_TRTLLMMODEL: + self.trtllm_loader = get_class(MODEL_TYPE_TO_TRTLLMMODEL[self.config.model_type]) + self.logger.info(f"\t+ Using TRTLLMModel class {self.trtllm_loader.__name__}") + else: + raise NotImplementedError(f"TRTLLMBackend does not support model_type {self.config.model_type}") + + def load(self) 
-> None: self.logger.info("\t+ Creating backend temporary directory") self.tmpdir = TemporaryDirectory() self.logger.info("\t+ Loading pretrained TRTLLMModel") self.load_trtmodel_from_pretrained() + self.logger.info("\t+ Cleaning up backend temporary directory") self.tmpdir.cleanup() - def validate_model_type(self) -> None: - if self.model_type not in MODEL_TYPE_TO_TRTLLMMODEL: - raise NotImplementedError(f"TRTLLMBackend does not support model_type {self.model_type}") - - self.trtmodel_class = get_class(MODEL_TYPE_TO_TRTLLMMODEL[self.model_type]) - self.logger.info(f"\t+ Using TRTLLMModel class {self.trtmodel_class.__name__}") - - def create_no_weights_model(self) -> None: - self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") - self.logger.info("\t+ Creating no weights model state dict") - state_dict = torch.nn.Linear(1, 1).state_dict() - self.logger.info("\t+ Saving no weights model safetensors") - safetensors = os.path.join(self.no_weights_model, "model.safetensors") - save_file(tensors=state_dict, filename=safetensors, metadata={"format": "pt"}) - - if self.config.library == "transformers": - self.logger.info("\t+ Saving no weights model pretrained config") - self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) - def load_trtmodel_from_pretrained(self) -> None: - self.pretrained_model = self.trtmodel_class.from_pretrained( + self.pretrained_model = self.trtllm_loader.from_pretrained( self.config.model, tp=self.config.tp, pp=self.config.pp, diff --git a/optimum_benchmark/backends/timm_utils.py b/optimum_benchmark/backends/timm_utils.py index 22a017f9..77ed3000 100644 --- a/optimum_benchmark/backends/timm_utils.py +++ b/optimum_benchmark/backends/timm_utils.py @@ -5,18 +5,19 @@ from ..import_utils import is_timm_available if is_timm_available(): - import timm # type: ignore + from timm import create_model + from timm.models import get_pretrained_cfg, load_model_config_from_hf, parse_model_name def get_timm_pretrained_config(model_name: str) -> PretrainedConfig: - model_source, model_name = timm.models.parse_model_name(model_name) + model_source, model_name = parse_model_name(model_name) if model_source == "hf-hub": # For model names specified in the form `hf-hub:path/architecture_name@revision`, # load model weights + pretrained_cfg from Hugging Face hub. 
- pretrained_cfg, model_name = timm.models.load_model_config_from_hf(model_name) + pretrained_cfg, model_name = load_model_config_from_hf(model_name) return pretrained_cfg - return timm.get_pretrained_cfg(model_name) + return get_pretrained_cfg(model_name) def extract_timm_shapes_from_config(config: PretrainedConfig) -> Dict[str, Any]: @@ -70,3 +71,7 @@ def extract_timm_shapes_from_config(config: PretrainedConfig) -> Dict[str, Any]: shapes["num_labels"] = num_classes return shapes + + +def get_timm_automodel_loader(): + return create_model diff --git a/optimum_benchmark/backends/torch_ort/backend.py b/optimum_benchmark/backends/torch_ort/backend.py index 5f915001..61401a75 100644 --- a/optimum_benchmark/backends/torch_ort/backend.py +++ b/optimum_benchmark/backends/torch_ort/backend.py @@ -1,16 +1,14 @@ -import os from tempfile import TemporaryDirectory from typing import Any, Callable, Dict, List import torch from datasets import Dataset from optimum.onnxruntime import ORTTrainer, ORTTrainingArguments -from safetensors.torch import save_file from transformers import TrainerCallback from ..base import Backend from ..peft_utils import apply_peft -from ..transformers_utils import random_init_weights +from ..transformers_utils import fast_weights_init from .config import TorchORTConfig @@ -19,12 +17,14 @@ class TorchORTBackend(Backend[TorchORTConfig]): def __init__(self, config: TorchORTConfig): super().__init__(config) - self.validate_library() + def load(self) -> None: self.logger.info("\t+ Creating backend temporary directory") self.tmpdir = TemporaryDirectory() if self.config.no_weights: + self.logger.info("\t+ Creating no weights AutoModel") + self.create_no_weights_model() self.logger.info("\t+ Loading no weights AutoModel") self.load_automodel_with_no_weights() else: @@ -35,43 +35,22 @@ def __init__(self, config: TorchORTConfig): self.logger.info("\t+ Applying PEFT") self.pretrained_model = apply_peft(self.pretrained_model, self.config.peft_type, self.config.peft_config) + self.logger.info("\t+ Cleaning up backend temporary directory") self.tmpdir.cleanup() - def validate_library(self) -> None: - if self.config.library == "transformers": - self.logger.info(f"Using AutoModel class {self.automodel_class.__name__}") - else: - raise NotImplementedError(f"TorchORTBackend does not support {self.config.library} library") - - def create_no_weights_model(self) -> None: - self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") - self.logger.info("\t+ Creating no weights model directory") - os.makedirs(self.no_weights_model, exist_ok=True) - self.logger.info("\t+ Creating no weights model state dict") - state_dict = torch.nn.Linear(1, 1).state_dict() - self.logger.info("\t+ Saving no weights model safetensors") - safetensors = os.path.join(self.no_weights_model, "model.safetensors") - save_file(tensors=state_dict, filename=safetensors, metadata={"format": "pt"}) - - if self.config.library == "transformers": - self.logger.info("\t+ Saving no weights model pretrained config") - self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) - def load_automodel_with_no_weights(self) -> None: - self.logger.info("\t+ Creating no weights model") - self.create_no_weights_model() + original_model, self.config.model = self.config.model, self.no_weights_model - with random_init_weights(): - original_model, self.config.model = self.config.model, self.no_weights_model - self.logger.info("\t+ Loading no weights AutoModel") + with fast_weights_init(): 
self.load_automodel_from_pretrained() - self.config.model = original_model self.logger.info("\t+ Tying model weights") self.pretrained_model.tie_weights() + self.config.model = original_model + def load_automodel_from_pretrained(self) -> None: - self.pretrained_model = self.automodel_class.from_pretrained( + self.pretrained_model = self.automodel_loader.from_pretrained( self.config.model, **self.automodel_kwargs, **self.config.model_kwargs ).to(self.config.device) diff --git a/optimum_benchmark/backends/transformers_utils.py b/optimum_benchmark/backends/transformers_utils.py index 2ae02100..87755e78 100644 --- a/optimum_benchmark/backends/transformers_utils.py +++ b/optimum_benchmark/backends/transformers_utils.py @@ -2,19 +2,70 @@ from typing import Any, Dict, Optional, Union import torch +import transformers from transformers import ( AutoConfig, + AutoFeatureExtractor, AutoProcessor, AutoTokenizer, FeatureExtractionMixin, GenerationConfig, ImageProcessingMixin, PretrainedConfig, - PreTrainedTokenizer, ProcessorMixin, + SpecialTokensMixin, ) -PretrainedProcessor = Union[FeatureExtractionMixin, ImageProcessingMixin, PreTrainedTokenizer, ProcessorMixin] +from ..import_utils import is_torch_available + +TASKS_TO_MODEL_LOADERS = { + # text processing + "feature-extraction": "AutoModel", + "fill-mask": "AutoModelForMaskedLM", + "multiple-choice": "AutoModelForMultipleChoice", + "question-answering": "AutoModelForQuestionAnswering", + "token-classification": "AutoModelForTokenClassification", + "text-classification": "AutoModelForSequenceClassification", + # audio processing + "audio-xvector": "AutoModelForAudioXVector", + "text-to-audio": "AutoModelForTextToSpectrogram", + "audio-classification": "AutoModelForAudioClassification", + "audio-frame-classification": "AutoModelForAudioFrameClassification", + # image processing + "mask-generation": "AutoModel", + "image-to-image": "AutoModelForImageToImage", + "masked-im": "AutoModelForMaskedImageModeling", + "object-detection": "AutoModelForObjectDetection", + "depth-estimation": "AutoModelForDepthEstimation", + "image-segmentation": "AutoModelForImageSegmentation", + "image-classification": "AutoModelForImageClassification", + "semantic-segmentation": "AutoModelForSemanticSegmentation", + "zero-shot-object-detection": "AutoModelForZeroShotObjectDetection", + "zero-shot-image-classification": "AutoModelForZeroShotImageClassification", + # text generation + "image-to-text": "AutoModelForVision2Seq", + "text-generation": "AutoModelForCausalLM", + "text2text-generation": "AutoModelForSeq2SeqLM", + "visual-question-answering": "AutoModelForVisualQuestionAnswering", + "automatic-speech-recognition": ("AutoModelForSpeechSeq2Seq", "AutoModelForCTC"), +} + + +if is_torch_available(): + TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES = {} + for task_name, model_loaders in TASKS_TO_MODEL_LOADERS.items(): + TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES[task_name] = {} + + if isinstance(model_loaders, str): + model_loaders = (model_loaders,) + + for model_loader_name in model_loaders: + model_loader_class = getattr(transformers, model_loader_name) + TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES[task_name].update(model_loader_class._model_mapping._model_mapping) +else: + TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES = {} + +PretrainedProcessor = Union[FeatureExtractionMixin, ImageProcessingMixin, SpecialTokensMixin, ProcessorMixin] def get_transformers_pretrained_config(model: str, **kwargs) -> "PretrainedConfig": @@ -36,9 +87,12 @@ def get_transformers_pretrained_processor(model: str, 
**kwargs) -> Optional["Pre return AutoProcessor.from_pretrained(model, **kwargs) except Exception: try: - return AutoTokenizer.from_pretrained(model, **kwargs) + return AutoFeatureExtractor.from_pretrained(model, **kwargs) except Exception: - return None + try: + return AutoTokenizer.from_pretrained(model, **kwargs) + except Exception: + return None def extract_transformers_shapes_from_artifacts( @@ -114,6 +168,12 @@ def extract_transformers_shapes_from_artifacts( return shapes +def get_transformers_automodel_loader_for_task(task: str): + model_loader_name = TASKS_TO_MODEL_LOADERS[task] + model_loader_class = getattr(transformers, model_loader_name) + return model_loader_class + + TORCH_INIT_FUNCTIONS = { "normal_": torch.nn.init.normal_, "uniform_": torch.nn.init.uniform_, @@ -131,20 +191,20 @@ def extract_transformers_shapes_from_artifacts( } -def fast_rand(tensor: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: +def fast_random_tensor(tensor: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: return torch.nn.init.uniform_(tensor) @contextmanager -def random_init_weights(): +def fast_weights_init(): # Replace the initialization functions for name, init_func in TORCH_INIT_FUNCTIONS.items(): - if name != "uniform_": - setattr(torch.nn.init, name, fast_rand) + if name != "uniform_": # avoid recursion + setattr(torch.nn.init, name, fast_random_tensor) try: yield finally: # Restore the original initialization functions for name, init_func in TORCH_INIT_FUNCTIONS.items(): - if name != "uniform_": + if name != "uniform_": # avoid recursion setattr(torch.nn.init, name, init_func) diff --git a/optimum_benchmark/backends/vllm/backend.py b/optimum_benchmark/backends/vllm/backend.py index a0833477..1a28de4a 100644 --- a/optimum_benchmark/backends/vllm/backend.py +++ b/optimum_benchmark/backends/vllm/backend.py @@ -1,6 +1,6 @@ import os from tempfile import TemporaryDirectory -from typing import Any, Dict, Tuple +from typing import Any, Dict import torch from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE @@ -9,7 +9,7 @@ from ...task_utils import TEXT_GENERATION_TASKS from ..base import Backend -from ..transformers_utils import random_init_weights +from ..transformers_utils import fast_weights_init from .config import VLLMConfig @@ -18,29 +18,34 @@ class VLLMBackend(Backend[VLLMConfig]): def __init__(self, config: VLLMConfig) -> None: super().__init__(config) - self.validate_task() + if self.config.task not in TEXT_GENERATION_TASKS: + raise NotImplementedError(f"vLLM does not support task {self.config.task}") + + def load(self) -> None: self.logger.info("\t+ Creating backend temporary directory") self.tmpdir = TemporaryDirectory() if self.config.no_weights: + self.logger.info("\t+ Creating no weights model") + self.create_no_weights_model() self.logger.info("\t+ Loading no weights model") self.load_model_with_no_weights() else: self.logger.info("\t+ Downloading pretrained model") self.download_pretrained_model() - - self.logger.info("\t+ Preparing generation config") - self.prepare_generation_config() - + if self.config.task in TEXT_GENERATION_TASKS: + self.logger.info("\t+ Preparing generation config") + self.prepare_generation_config() self.logger.info("\t+ Loading pretrained model") self.load_model_from_pretrained() + self.logger.info("\t+ Cleaning up backend temporary directory") self.tmpdir.cleanup() def download_pretrained_model(self) -> None: with torch.device("meta"): - self.automodel_class.from_pretrained(self.config.model, **self.config.model_kwargs) + 
self.automodel_loader.from_pretrained(self.config.model, **self.config.model_kwargs) def prepare_generation_config(self) -> None: self.generation_config.eos_token_id = None @@ -69,8 +74,8 @@ def create_no_weights_model(self) -> None: self.pretrained_processor.save_pretrained(save_directory=self.no_weights_model) # unlike Transformers, vLLM won't accept any missing tensors so we need to materialize the model self.logger.info(f"\t+ Loading no weights model from {self.no_weights_model}") - with random_init_weights(): - self.pretrained_model = self.automodel_class.from_pretrained( + with fast_weights_init(): + self.pretrained_model = self.automodel_loader.from_pretrained( self.no_weights_model, **self.config.model_kwargs, device_map="auto", _fast_init=False ) self.logger.info("\t+ Saving no weights model") @@ -82,14 +87,10 @@ def create_no_weights_model(self) -> None: self.logger.info("\t+ Modifying generation config for fixed length generation") self.generation_config.eos_token_id = None self.generation_config.pad_token_id = None - self.logger.info("\t+ Saving new pretrained generation config") self.generation_config.save_pretrained(save_directory=self.no_weights_model) def load_model_with_no_weights(self) -> None: - self.logger.info("\t+ Creating no weights model") - self.create_no_weights_model() - original_model, self.config.model = self.config.model, self.no_weights_model self.logger.info("\t+ Loading no weights model") self.load_model_from_pretrained() @@ -125,21 +126,13 @@ def load_model_from_pretrained(self) -> None: seed=self.config.seed, ) - def validate_task(self) -> None: - if self.config.task not in ["text-generation"]: - raise ValueError(f"Task {self.config.task} not supported by {self.NAME}") - - def prepare_inputs( - self, inputs: Dict[str, Any], input_shapes: Dict[str, Any] - ) -> Tuple[Dict[str, Any], Dict[str, Any]]: - inputs, input_shapes = super().prepare_inputs(inputs, input_shapes) - + def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: if self.config.task in TEXT_GENERATION_TASKS: inputs = {"prompts": self.pretrained_processor.batch_decode(inputs["input_ids"])} else: raise NotImplementedError(f"vLLM does not support task {self.config.task}") - return inputs, input_shapes + return inputs def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> Any: return self.pretrained_model.generate( @@ -163,6 +156,7 @@ def prefill(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> Dict[str, A use_tqdm=False, sampling_params=SamplingParams( ignore_eos=True, + detokenize=True, seed=self.config.seed, n=kwargs.get("num_return_sequences"), max_tokens=kwargs.get("max_new_tokens"), @@ -178,6 +172,7 @@ def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> Any: use_tqdm=False, sampling_params=SamplingParams( ignore_eos=True, + detokenize=True, n=kwargs.get("num_return_sequences"), max_tokens=kwargs.get("max_new_tokens"), min_tokens=kwargs.get("min_new_tokens"), diff --git a/optimum_benchmark/launchers/process/launcher.py b/optimum_benchmark/launchers/process/launcher.py index 2067a841..fbc990e1 100644 --- a/optimum_benchmark/launchers/process/launcher.py +++ b/optimum_benchmark/launchers/process/launcher.py @@ -21,11 +21,12 @@ def __init__(self, config: ProcessConfig): if get_start_method(allow_none=True) != self.config.start_method: self.logger.info(f"\t+ Setting multiprocessing start method to {self.config.start_method}") set_start_method(self.config.start_method, force=True) - self.logger.info("\t+ Warming up multiprocessing context") # 
creates the resource tracker with default executable - dummy_process = Process() + self.logger.info("\t+ Warming up multiprocessing context") + dummy_process = Process(target=dummy_target, daemon=False) dummy_process.start() dummy_process.join() + dummy_process.close() def launch(self, worker: Callable[..., BenchmarkReport], worker_args: List[Any]) -> BenchmarkReport: child_connection, parent_connection = Pipe() @@ -110,3 +111,7 @@ def target( logger.info("\t+ Exiting isolated process") connection.close() exit(0) + + +def dummy_target() -> None: + exit(0) diff --git a/optimum_benchmark/scenarios/energy_star/scenario.py b/optimum_benchmark/scenarios/energy_star/scenario.py index fbb4c1ed..3bf003ff 100644 --- a/optimum_benchmark/scenarios/energy_star/scenario.py +++ b/optimum_benchmark/scenarios/energy_star/scenario.py @@ -144,11 +144,12 @@ def run(self, backend: Backend[BackendConfigT]) -> BenchmarkReport: LOGGER.info("\t+ Preparing backend for Inference") backend.prepare_for_inference( - **backend.model_shapes, - **self.config.input_shapes, - **self.config.generate_kwargs, - **self.config.forward_kwargs, - **self.config.call_kwargs, + input_shapes=self.config.input_shapes, + inference_kwargs={ + **self.config.generate_kwargs, + **self.config.forward_kwargs, + **self.config.call_kwargs, + }, ) LOGGER.info("\t+ Warming up backend for Inference") diff --git a/optimum_benchmark/scenarios/inference/config.py b/optimum_benchmark/scenarios/inference/config.py index 3ceb8895..2c05d97f 100644 --- a/optimum_benchmark/scenarios/inference/config.py +++ b/optimum_benchmark/scenarios/inference/config.py @@ -18,15 +18,23 @@ class InferenceConfig(ScenarioConfig): # benchmark options iterations: int = field( default=10, - metadata={"help": "Minimum number of iterations to run the benchmark, set to 0 to disable this constraint"}, + metadata={ + "help": "Minimum number of iterations to run the benchmark. " + "The number of tracked inferences will be at least this value." + "Set to 0 to disable this constraint (benchmark will run for `duration` seconds)." + }, ) duration: int = field( default=10, - metadata={"help": "Minimum duration of the benchmark in seconds, set to 0 to disable this constraint"}, + metadata={ + "help": "Minimum duration of the benchmark in seconds. " + "The sum of tracked inferences will be at least this value." + "Set to 0 to disable this constraint (benchmark will run for `iterations` iterations)." 
+ }, ) warmup_runs: int = field( default=10, - metadata={"help": "Number of warmup runs to perform before benchmarking, set to 0 to disable warmup"}, + metadata={"help": "Number of warmup runs to perform before benchmarking."}, ) # input/output config @@ -40,8 +48,8 @@ class InferenceConfig(ScenarioConfig): ) # tracking options - latency: bool = field(default=True, metadata={"help": "Measure latencies and throughputs"}) memory: bool = field(default=False, metadata={"help": "Measure max memory usage"}) + latency: bool = field(default=True, metadata={"help": "Measure latencies and throughputs"}) energy: bool = field(default=False, metadata={"help": "Measure energy usage and efficiency"}) # methods kwargs diff --git a/optimum_benchmark/scenarios/inference/scenario.py b/optimum_benchmark/scenarios/inference/scenario.py index 2d327df1..c0d9475e 100644 --- a/optimum_benchmark/scenarios/inference/scenario.py +++ b/optimum_benchmark/scenarios/inference/scenario.py @@ -1,4 +1,5 @@ import time +from contextlib import ExitStack from transformers import LogitsProcessorList @@ -68,52 +69,27 @@ def run(self, backend: Backend[BackendConfigT]) -> BenchmarkReport: self.logger.info("\t+ Updating Text Generation kwargs with default values") self.config.generate_kwargs = {**TEXT_GENERATION_DEFAULT_KWARGS, **self.config.generate_kwargs} self.logger.info("\t+ Initializing Text Generation report") - - self.report = BenchmarkReport.from_list(targets=["prefill", "decode", "per_token"]) - + self.report = BenchmarkReport.from_list(targets=["load", "prefill", "decode", "per_token"]) elif backend.config.task in IMAGE_DIFFUSION_TASKS: self.logger.info("\t+ Generating Image Diffusion inputs") self.inputs = self.input_generator() self.logger.info("\t+ Updating Image Diffusion kwargs with default values") self.config.call_kwargs = {**IMAGE_DIFFUSION_DEFAULT_KWARGS, **self.config.call_kwargs} self.logger.info("\t+ Initializing Image Diffusion report") - self.report = BenchmarkReport.from_list(targets=["call"]) - + self.report = BenchmarkReport.from_list(targets=["load", "call"]) else: self.logger.info("\t+ Generating Inference inputs") self.inputs = self.input_generator() self.logger.info("\t+ Initializing Inference report") - self.report = BenchmarkReport.from_list(targets=["forward"]) + self.report = BenchmarkReport.from_list(targets=["load", "forward"]) - self.logger.info("\t+ Preparing inputs for Inference") - self.inputs, self.config.input_shapes = backend.prepare_inputs( - inputs=self.inputs, input_shapes=self.config.input_shapes - ) + self.logger.info("\t+ Preparing input shapes for Inference") + self.config.input_shapes = backend.prepare_input_shapes(input_shapes=self.config.input_shapes) - self.logger.info("\t+ Preparing backend for Inference") - backend.prepare_for_inference( - input_shapes=self.config.input_shapes, - inference_kwargs={ - **self.config.generate_kwargs, - **self.config.forward_kwargs, - **self.config.call_kwargs, - }, - ) + self.run_model_loading_tracking(backend) - if backend.config.task in TEXT_GENERATION_TASKS: - self.logger.info("\t+ Warming up backend for Text Generation") - _ = backend.generate(self.inputs, self.config.generate_kwargs) - for _ in range(self.config.warmup_runs): - _ = backend.generate(self.inputs, {**self.config.generate_kwargs, **TEXT_GENERATION_WARMUP_OVERRIDES}) - elif backend.config.task in IMAGE_DIFFUSION_TASKS: - self.logger.info("\t+ Warming up backend for Image Diffusion") - _ = backend.call(self.inputs, self.config.call_kwargs) - for _ in 
range(self.config.warmup_runs): - _ = backend.call(self.inputs, {**self.config.call_kwargs, **IMAGE_DIFFUSION_WARMUP_OVERRIDES}) - else: - self.logger.info("\t+ Warming up backend for Inference") - for _ in range(self.config.warmup_runs): - _ = backend.forward(self.inputs, self.config.forward_kwargs) + self.logger.info("\t+ Preparing inputs for Inference") + self.inputs = backend.prepare_inputs(inputs=self.inputs) if self.config.memory: if backend.config.task in TEXT_GENERATION_TASKS: @@ -125,6 +101,15 @@ def run(self, backend: Backend[BackendConfigT]) -> BenchmarkReport: self.report.log_memory() + if self.config.latency or self.config.energy: + # latency and energy are metrics that require some warmup + if backend.config.task in TEXT_GENERATION_TASKS: + self.warmup_text_generation(backend) + elif backend.config.task in IMAGE_DIFFUSION_TASKS: + self.warmup_image_diffusion(backend) + else: + self.warmup_inference(backend) + if self.config.latency: if backend.config.task in TEXT_GENERATION_TASKS: if backend.config.name in PER_TOKEN_BACKENDS: @@ -152,6 +137,57 @@ def run(self, backend: Backend[BackendConfigT]) -> BenchmarkReport: return self.report + def warmup_text_generation(self, backend: Backend[BackendConfigT]): + self.logger.info("\t+ Warming up backend for Text Generation") + _ = backend.generate(self.inputs, self.config.generate_kwargs) + for _ in range(self.config.warmup_runs): + _ = backend.generate(self.inputs, {**self.config.generate_kwargs, **TEXT_GENERATION_WARMUP_OVERRIDES}) + + def warmup_image_diffusion(self, backend: Backend[BackendConfigT]): + self.logger.info("\t+ Warming up backend for Image Diffusion") + _ = backend.call(self.inputs, self.config.call_kwargs) + for _ in range(self.config.warmup_runs): + _ = backend.call(self.inputs, {**self.config.call_kwargs, **IMAGE_DIFFUSION_WARMUP_OVERRIDES}) + + def warmup_inference(self, backend: Backend[BackendConfigT]): + self.logger.info("\t+ Warming up backend for Inference") + for _ in range(self.config.warmup_runs): + _ = backend.forward(self.inputs, self.config.forward_kwargs) + + # Loading tracking + def run_model_loading_tracking(self, backend: Backend[BackendConfigT]): + self.logger.info("\t+ Running model loading tracking") + + if self.config.latency: + latency_tracker = LatencyTracker(backend=backend.config.name, device=backend.config.device) + if self.config.memory: + memory_tracker = MemoryTracker( + backend=backend.config.name, device=backend.config.device, device_ids=backend.config.device_ids + ) + if self.config.energy: + energy_tracker = EnergyTracker( + backend=backend.config.name, device=backend.config.device, device_ids=backend.config.device_ids + ) + + context_stack = ExitStack() + if self.config.latency: + context_stack.enter_context(latency_tracker.track()) + if self.config.memory: + context_stack.enter_context(memory_tracker.track()) + if self.config.energy: + context_stack.enter_context(energy_tracker.track()) + + with context_stack: + self.logger.info("\t+ Loading model for Inference") + backend.load() + + if self.config.latency: + self.report.load.latency = latency_tracker.get_latency() + if self.config.memory: + self.report.load.memory = memory_tracker.get_max_memory() + if self.config.energy: + self.report.load.energy = energy_tracker.get_energy() + ## Memory tracking def run_text_generation_memory_tracking(self, backend: Backend[BackendConfigT]): self.logger.info("\t+ Running Text Generation memory tracking") @@ -272,7 +308,7 @@ def run_image_diffusion_latency_tracking(self, backend: 
Backend[BackendConfigT]) ) def run_latency_inference_tracking(self, backend: Backend[BackendConfigT]): - self.logger.info("\t+ Running latency tracking") + self.logger.info("\t+ Running Inference latency tracking") latency_tracker = LatencyTracker(backend=backend.config.name, device=backend.config.device) while latency_tracker.elapsed() < self.config.duration or latency_tracker.count() < self.config.iterations: diff --git a/optimum_benchmark/scenarios/training/scenario.py b/optimum_benchmark/scenarios/training/scenario.py index e7fc67fe..d42fc269 100644 --- a/optimum_benchmark/scenarios/training/scenario.py +++ b/optimum_benchmark/scenarios/training/scenario.py @@ -33,29 +33,38 @@ def run(self, backend: Backend[BackendConfigT]) -> BenchmarkReport: self.logger.info("\t+ Initializing training report") self.report = BenchmarkReport.from_list(targets=["overall", "warmup", "train"]) + self.logger.info("\t+ Loading model into backend") + backend.load() + training_callbackes = [] + if self.config.latency: - self.logger.info("\t+ Adding latency measuring callback") + self.logger.info("\t+ Creating latency tracking callback") latency_callback = StepLatencyTrainerCallback(device=backend.config.device, backend=backend.config.name) + self.logger.info("\t+ Adding latency measuring callback") training_callbackes.append(latency_callback) - training_trackers = [] + context_stack = ExitStack() + if self.config.memory: - self.logger.info("\t+ Adding memory tracking context manager") + self.logger.info("\t+ Creating memory tracking context manager") memory_tracker = MemoryTracker( device=backend.config.device, backend=backend.config.name, device_ids=backend.config.device_ids ) - training_trackers.append(memory_tracker.track()) if self.config.energy: - self.logger.info("\t+ Adding energy tracking context manager") + self.logger.info("\t+ Creating energy tracking context manager") energy_tracker = EnergyTracker(device=backend.config.device, device_ids=backend.config.device_ids) - training_trackers.append(energy_tracker.track()) - with ExitStack() as stack: - for tracker in training_trackers: - stack.enter_context(tracker) + if self.config.memory: + self.logger.info("\t+ Entering memory tracking context manager") + context_stack.enter_context(memory_tracker.track()) + + if self.config.energy: + self.logger.info("\t+ Entering energy tracking context manager") + context_stack.enter_context(energy_tracker.track()) + with context_stack: backend.train( training_dataset=training_dataset, training_callbacks=training_callbackes, diff --git a/optimum_benchmark/task_utils.py b/optimum_benchmark/task_utils.py index cf1701b5..4587097f 100644 --- a/optimum_benchmark/task_utils.py +++ b/optimum_benchmark/task_utils.py @@ -1,59 +1,24 @@ import importlib +import json import os from typing import Optional import huggingface_hub -_TRANSFORMERS_TASKS_TO_MODEL_LOADERS = { - # text processing - "feature-extraction": "AutoModel", - "fill-mask": "AutoModelForMaskedLM", - "multiple-choice": "AutoModelForMultipleChoice", - "question-answering": "AutoModelForQuestionAnswering", - "token-classification": "AutoModelForTokenClassification", - "text-classification": "AutoModelForSequenceClassification", - # audio processing - "audio-xvector": "AutoModelForAudioXVector", - "text-to-audio": "AutoModelForTextToSpectrogram", - "audio-classification": "AutoModelForAudioClassification", - "audio-frame-classification": "AutoModelForAudioFrameClassification", - "conversational": ("AutoModelForCausalLM", "AutoModelForSeq2SeqLM"), - # image 
processing - "mask-generation": "AutoModel", - "image-to-image": "AutoModelForImageToImage", - "masked-im": "AutoModelForMaskedImageModeling", - "object-detection": "AutoModelForObjectDetection", - "depth-estimation": "AutoModelForDepthEstimation", - "image-classification": "AutoModelForImageClassification", - "semantic-segmentation": "AutoModelForSemanticSegmentation", - "zero-shot-object-detection": "AutoModelForZeroShotObjectDetection", - "zero-shot-image-classification": "AutoModelForZeroShotImageClassification", - "image-segmentation": ("AutoModelForImageSegmentation", "AutoModelForSemanticSegmentation"), - # text generation - "image-to-text": "AutoModelForVision2Seq", - "text-generation": "AutoModelForCausalLM", - "text2text-generation": "AutoModelForSeq2SeqLM", - "visual-question-answering": "AutoModelForVisualQuestionAnswering", - "automatic-speech-recognition": ("AutoModelForSpeechSeq2Seq", "AutoModelForCTC"), -} - -_DIFFUSERS_TASKS_TO_MODEL_LOADERS = { - "inpainting": "AutoPipelineForInpainting", - "text-to-image": "AutoPipelineForText2Image", - "image-to-image": "AutoPipelineForImage2Image", - "stable-diffusion": "StableDiffusionPipeline", # should be deprecated - "stable-diffusion-xl": "StableDiffusionXLImg2ImgPipeline", # should be deprecated -} -_TIMM_TASKS_TO_MODEL_LOADERS = { - "image-classification": "create_model", -} - - -_LIBRARY_TO_TASKS_TO_MODEL_LOADER_MAP = { - "timm": _TIMM_TASKS_TO_MODEL_LOADERS, - "diffusers": _DIFFUSERS_TASKS_TO_MODEL_LOADERS, - "transformers": _TRANSFORMERS_TASKS_TO_MODEL_LOADERS, -} +from .backends.diffusers_utils import ( + TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES as DIFFUSERS_TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES, +) +from .backends.diffusers_utils import ( + get_diffusers_pretrained_config, +) +from .backends.timm_utils import get_timm_pretrained_config +from .backends.transformers_utils import ( + TASKS_TO_MODEL_LOADERS, + get_transformers_pretrained_config, +) +from .backends.transformers_utils import ( + TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES as TRANSFORMERS_TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES, +) _SYNONYM_TASK_MAP = { "masked-lm": "fill-mask", @@ -70,18 +35,12 @@ "speech2seq-lm": "automatic-speech-recognition", "sequence-classification": "text-classification", "zero-shot-classification": "text-classification", - "causal-lm-with-past": "text-generation-with-past", - "default-with-past": "feature-extraction-with-past", - "seq2seq-lm-with-past": "text2text-generation-with-past", - "speech2seq-lm-with-past": "automatic-speech-recognition-with-past", } IMAGE_DIFFUSION_TASKS = [ "inpainting", "text-to-image", "image-to-image", - "stable-diffusion", - "stable-diffusion-xl", ] TEXT_GENERATION_TASKS = [ @@ -93,7 +52,6 @@ ] TEXT_EMBEDDING_TASKS = [ - "fill-mask", "feature-extraction", ] @@ -105,70 +63,104 @@ def map_from_synonym(task: str) -> str: def infer_library_from_model_name_or_path(model_name_or_path: str, revision: Optional[str] = None) -> str: - is_local = os.path.isdir(model_name_or_path) + inferred_library_name = None - if is_local: - raise RuntimeError("Cannot infer the library from a local directory yet, please specify the library manually.") + if huggingface_hub.repo_exists(model_name_or_path): + model_info = huggingface_hub.model_info(model_name_or_path, revision=revision) + inferred_library_name = getattr(model_info, "library_name", None) - model_info = huggingface_hub.model_info(model_name_or_path, revision=revision) + if inferred_library_name == "sentence-transformers": + inferred_library_name = "transformers" - 
inferred_library_name = getattr(model_info, "library_name", None) + if inferred_library_name is None: + raise RuntimeError(f"Could not infer library name from repo {model_name_or_path}.") - if inferred_library_name is None: - raise KeyError(f"Could not find the proper library name for {model_name_or_path}.") + elif os.path.isdir(model_name_or_path): + local_files = os.listdir(model_name_or_path) - if inferred_library_name == "sentence-transformers": - inferred_library_name = "transformers" + if "model_index.json" in local_files: + inferred_library_name = "diffusers" + elif "config.json" in local_files: + config_dict = json.load(open(os.path.join(model_name_or_path, "config.json"), "r")) + if "pretrained_cfg" in config_dict or "architecture" in config_dict: + inferred_library_name = "timm" + elif "_diffusers_version" in config_dict: + inferred_library_name = "diffusers" + else: + inferred_library_name = "transformers" - return inferred_library_name + if inferred_library_name is None: + raise KeyError(f"Could not find the proper library name for directory {model_name_or_path}.") + else: + raise KeyError( + f"Could not find the proper library name for {model_name_or_path}" + " because it's neither a repo nor a directory." + ) -# adapted from https://github.com/huggingface/optimum/blob/main/optimum/exporters/tasks.py without torch dependency -def infer_task_from_model_name_or_path(model_name_or_path: str, revision: Optional[str] = None) -> str: - is_local = os.path.isdir(model_name_or_path) + return inferred_library_name - if is_local: - raise RuntimeError("Cannot infer the task from a local directory yet, please specify the task manually.") - model_info = huggingface_hub.model_info(model_name_or_path, revision=revision) +def infer_task_from_model_name_or_path(model_name_or_path: str, revision: Optional[str] = None) -> str: library_name = infer_library_from_model_name_or_path(model_name_or_path, revision=revision) + inferred_task_name = None + if library_name == "timm": inferred_task_name = "image-classification" - elif library_name == "sentence-transformers": inferred_task_name = "feature-extraction" - - elif library_name == "diffusers": - if "text-to-image" in model_info.tags: - inferred_task_name = "text-to-image" - elif "image-to-image" in model_info.tags: - inferred_task_name = "image-to-image" - elif "inpainting" in model_info.tags: - inferred_task_name = "inpainting" - else: - class_name = model_info.config["diffusers"]["class_name"] - inferred_task_name = "stable-diffusion-xl" if "XL" in class_name else "stable-diffusion" - - elif library_name == "transformers": - if model_info.pipeline_tag is not None: - inferred_task_name = map_from_synonym(model_info.pipeline_tag) - else: - pipeline_tag = model_info.transformersInfo.pipeline_tag - - if model_info.transformers_info is not None and pipeline_tag is not None: - inferred_task_name = map_from_synonym(pipeline_tag) + elif huggingface_hub.repo_exists(model_name_or_path): + model_info = huggingface_hub.model_info(model_name_or_path, revision=revision) + + if library_name == "diffusers": + if model_info.pipeline_tag is not None: + inferred_task_name = map_from_synonym(model_info.pipeline_tag) + elif library_name == "transformers": + if model_info.pipeline_tag is not None: + inferred_task_name = map_from_synonym(model_info.pipeline_tag) else: - auto_model_class_name = model_info.transformers_info["auto_model"] - tasks_to_automodels = _LIBRARY_TO_TASKS_TO_MODEL_LOADER_MAP[model_info.library_name] - for task_name, class_name_for_task in 
tasks_to_automodels.items(): - if class_name_for_task == auto_model_class_name: + if model_info.transformers_info is not None and model_info.transformersInfo.pipeline_tag is not None: + inferred_task_name = map_from_synonym(model_info.transformersInfo.pipeline_tag) + else: + auto_model_class_name = model_info.transformers_info["auto_model"] + for task_name, model_loaders in TASKS_TO_MODEL_LOADERS.items(): + if isinstance(model_loaders, str): + model_loaders = (model_loaders,) + for model_loader in model_loaders: + if auto_model_class_name == model_loader: + inferred_task_name = task_name + break + if inferred_task_name is not None: + break + elif os.path.isdir(model_name_or_path): + if library_name == "diffusers": + diffusers_config = get_diffusers_pretrained_config(model_name_or_path, revision=revision) + class_name = diffusers_config["_class_name"] + + for task_name, model_mapping in DIFFUSERS_TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES.items(): + for model_type, model_class_name in model_mapping.items(): + if class_name == model_class_name: inferred_task_name = task_name break - inferred_task_name = None - - else: - raise NotImplementedError(f"Library {library_name} is not supported yet.") + if inferred_task_name is not None: + break + elif library_name == "transformers": + auto_modeling_module = importlib.import_module("transformers.models.auto.modeling_auto") + transformers_config = get_transformers_pretrained_config(model_name_or_path, revision=revision) + model_type = transformers_config.model_type + + for task_name, model_loaders in TRANSFORMERS_TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES.items(): + if isinstance(model_loaders, str): + model_loaders = (model_loaders,) + for model_loader in model_loaders: + model_loader_class = getattr(auto_modeling_module, model_loader) + model_mapping = model_loader_class._model_mapping._model_mapping + if model_type in model_mapping: + inferred_task_name = task_name + break + if inferred_task_name is not None: + break if inferred_task_name is None: raise KeyError(f"Could not find the proper task name for {auto_model_class_name}.") @@ -176,52 +168,36 @@ def infer_task_from_model_name_or_path(model_name_or_path: str, revision: Option return inferred_task_name -# adapted from https://github.com/huggingface/optimum/blob/main/optimum/exporters/tasks.py without torch dependency -def get_automodel_class_for_task( - task: str, - auto_model_class_name: Optional[str] = None, - model_type: Optional[str] = None, - library: str = "transformers", - framework: str = "pt", -): - task = map_from_synonym(task) - - if framework == "pt": - tasks_to_model_loader = _LIBRARY_TO_TASKS_TO_MODEL_LOADER_MAP[library] - elif framework == "jax": - raise NotImplementedError("JAX is not supported yet.") - elif framework == "tf": - raise NotImplementedError("TensorFlow is not supported yet.") - else: - raise NotImplementedError("Only PyTorch is supported for now.") - - loaded_library = importlib.import_module(library) - - if auto_model_class_name is None: - if task not in tasks_to_model_loader: - raise KeyError( - f"Unknown task: {task}. 
Possible values are: " - + ", ".join([f"`{key}` for {tasks_to_model_loader[key]}" for key in tasks_to_model_loader]) - ) - - if isinstance(tasks_to_model_loader[task], str): - inferred_auto_model_class_name = tasks_to_model_loader[task] - elif isinstance(tasks_to_model_loader[task], tuple): - if model_type is None: - inferred_auto_model_class_name = tasks_to_model_loader[task][0] - else: - for auto_class_name in tasks_to_model_loader[task]: - model_mapping = getattr(loaded_library, auto_class_name)._model_mapping._model_mapping +def infer_model_type_from_model_name_or_path(model_name_or_path: str, revision: Optional[str] = None) -> str: + library_name = infer_library_from_model_name_or_path(model_name_or_path, revision=revision) - if model_type in model_mapping or model_type.replace("-", "_") in model_mapping: - inferred_auto_model_class_name = auto_class_name - break + inferred_model_type = None - inferred_auto_model_class_name = None + if library_name == "timm": + timm_config = get_timm_pretrained_config(model_name_or_path) + inferred_model_type = timm_config.architecture + + elif library_name == "diffusers": + from diffusers import DiffusionPipeline + + config = DiffusionPipeline.load_config(model_name_or_path) + config, _ = config if isinstance(config, tuple) else (config, None) + class_name = config["_class_name"] + + for task_name, model_mapping in DIFFUSERS_TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES.items(): + for model_type, model_class_name in model_mapping.items(): + if model_class_name == class_name: + inferred_model_type = model_type + break + if inferred_model_type is not None: + break + else: + from transformers import AutoConfig - if inferred_auto_model_class_name is None: - raise ValueError(f"Could not find the model class name for task {task}.") + config = AutoConfig.from_pretrained(model_name_or_path) + inferred_model_type = config.model_type - inferred_model_class = getattr(loaded_library, inferred_auto_model_class_name) + if inferred_model_type is None: + raise KeyError(f"Could not find the proper model type for {model_name_or_path}.") - return inferred_model_class + return inferred_model_type diff --git a/optimum_benchmark/trackers/latency.py b/optimum_benchmark/trackers/latency.py index 6b8d614f..c130d162 100644 --- a/optimum_benchmark/trackers/latency.py +++ b/optimum_benchmark/trackers/latency.py @@ -44,9 +44,6 @@ def __getitem__(self, index) -> float: raise ValueError(f"Invalid index type: {type(index)}, expected int or slice") def __sub__(self, latency: "Latency") -> "Latency": - if not isinstance(latency, Latency): - raise ValueError(f"Cannot subtract {type(latency)} from Latency") - latencies = [lat - latency.mean for lat in self.values] assert not any(latency < 0 for latency in latencies), "Negative latency detected" @@ -82,14 +79,14 @@ def from_values(values: List[float], unit: str) -> "Latency": def log(self, prefix: str = "method"): stdev_percentage = 100 * self.stdev / self.mean if self.mean > 0 else 0 LOGGER.info(f"\t\t+ {prefix} latency:") - LOGGER.info(f"\t\t\t+ count: {self.count}") - LOGGER.info(f"\t\t\t+ total: {self.total:f} {self.unit}") - LOGGER.info(f"\t\t\t+ mean: {self.mean:f} {self.unit}") - LOGGER.info(f"\t\t\t+ stdev: {self.stdev:f} {self.unit} ({stdev_percentage:.2f}%)") - LOGGER.info(f"\t\t\t+ p50: {self.p50:f} {self.unit}") - LOGGER.info(f"\t\t\t+ p90: {self.p90:f} {self.unit}") - LOGGER.info(f"\t\t\t+ p95: {self.p95:f} {self.unit}") - LOGGER.info(f"\t\t\t+ p99: {self.p99:f} {self.unit}") + LOGGER.info(f"\t\t\t- count: {self.count}") + 
LOGGER.info(f"\t\t\t- total: {self.total:f} {self.unit}") + LOGGER.info(f"\t\t\t- mean: {self.mean:f} {self.unit}") + LOGGER.info(f"\t\t\t- stdev: {self.stdev:f} {self.unit} ({stdev_percentage:.2f}%)") + LOGGER.info(f"\t\t\t- p50: {self.p50:f} {self.unit}") + LOGGER.info(f"\t\t\t- p90: {self.p90:f} {self.unit}") + LOGGER.info(f"\t\t\t- p95: {self.p95:f} {self.unit}") + LOGGER.info(f"\t\t\t- p99: {self.p99:f} {self.unit}") @dataclass diff --git a/tests/configs/_bert_.yaml b/tests/configs/_bert_.yaml index a9b5a38a..e54d2925 100644 --- a/tests/configs/_bert_.yaml +++ b/tests/configs/_bert_.yaml @@ -1,2 +1,3 @@ backend: model: google-bert/bert-base-uncased + task: feature-extraction diff --git a/tests/configs/_diffusers_.yaml b/tests/configs/_diffusers_.yaml index 0f8e4d27..607b2502 100644 --- a/tests/configs/_diffusers_.yaml +++ b/tests/configs/_diffusers_.yaml @@ -1,4 +1,4 @@ backend: library: diffusers - task: stable-diffusion + task: text-to-image model: hf-internal-testing/tiny-stable-diffusion-torch diff --git a/tests/configs/cuda_inference_py_txi_bert.yaml b/tests/configs/cuda_inference_py_txi_bert.yaml index 68c726c5..62405f30 100644 --- a/tests/configs/cuda_inference_py_txi_bert.yaml +++ b/tests/configs/cuda_inference_py_txi_bert.yaml @@ -3,7 +3,7 @@ defaults: - _base_ # inherits from base config - _cuda_ # inherits from cuda config - _inference_ # inherits from inference config - - _bert_ # inherits from gpt config + - _bert_ # inherits from bert config - _self_ # hydra 1.1 compatibility - override backend: py-txi diff --git a/tests/test_api.py b/tests/test_api.py index c54fb075..56a44079 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -37,7 +37,7 @@ ("transformers", "text-classification", "FacebookAI/roberta-base"), ("transformers", "token-classification", "microsoft/deberta-v3-base"), ("transformers", "image-classification", "google/vit-base-patch16-224"), - ("diffusers", "stable-diffusion", "CompVis/stable-diffusion-v1-4"), + ("diffusers", "text-to-image", "CompVis/stable-diffusion-v1-4"), ] From 6351e36d37f4bc6aba5b9a4e7bac79b79cc14838 Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Mon, 15 Jul 2024 17:15:44 +0200 Subject: [PATCH 5/6] Update readme (#228) --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 083b5221..780b4fbb 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,8 @@ Optimum-Benchmark is a unified [multi-backend & multi-device](#backends--devices *News* 📰 - 🥳 PyPI package is now available for installation: `pip install optimum-benchmark` 🎉 [check it out](https://pypi.org/project/optimum-benchmark/) ! -- numactl support for Process and Torchrun launchers to control the NUMA nodes on which the benchmark runs 🧠 +- Model loading latency/memory/energy tracking for all backends in the inference scenario 🚀 +- numactl support for Process and Torchrun launchers to control the NUMA nodes on which the benchmark runs. 
- 4 minimal docker images (`cpu`, `cuda`, `rocm`, `cuda-ort`) in [packages](https://github.com/huggingface/optimum-benchmark/pkgs/container/optimum-benchmark) for testing, benchmarking and reproducibility 🐳 - vLLM backend for benchmarking [vLLM](https://github.com/vllm-project/vllm)'s inference engine 🚀 - Hosting the codebase of the [LLM-Perf Leaderboard](https://huggingface.co/spaces/optimum/llm-perf-leaderboard) 🥇 From 9337b863a5d9e4991ca83847e6c2f66a543b52d9 Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Mon, 22 Jul 2024 14:22:15 +0200 Subject: [PATCH 6/6] Update vllm backend to support offline and online serving modes (#232) --- ...aml => test_cli_cuda_vllm_single_gpu.yaml} | 2 +- examples/vllm_llama.yaml | 7 +- optimum_benchmark/backends/config.py | 22 +--- optimum_benchmark/backends/pytorch/config.py | 6 + .../backends/transformers_utils.py | 8 +- optimum_benchmark/backends/vllm/backend.py | 112 ++++++++---------- optimum_benchmark/backends/vllm/config.py | 66 +++++++---- tests/configs/_serving_mode_.yaml | 5 + tests/configs/cuda_inference_vllm_bloom.yaml | 2 + tests/test_cli.py | 8 +- 10 files changed, 131 insertions(+), 107 deletions(-) rename .github/workflows/{test_cli_cuda_vllm.yaml => test_cli_cuda_vllm_single_gpu.yaml} (94%) create mode 100644 tests/configs/_serving_mode_.yaml diff --git a/.github/workflows/test_cli_cuda_vllm.yaml b/.github/workflows/test_cli_cuda_vllm_single_gpu.yaml similarity index 94% rename from .github/workflows/test_cli_cuda_vllm.yaml rename to .github/workflows/test_cli_cuda_vllm_single_gpu.yaml index d4f9042d..66ce017a 100644 --- a/.github/workflows/test_cli_cuda_vllm.yaml +++ b/.github/workflows/test_cli_cuda_vllm_single_gpu.yaml @@ -50,4 +50,4 @@ jobs: run: | pip install packaging pip install -e .[testing,vllm,flash-attn] - pytest -x -s -k "cli and cuda and vllm" + FORCE_SERIAL=1 pytest -x -s -k "cli and cuda and vllm" diff --git a/examples/vllm_llama.yaml b/examples/vllm_llama.yaml index 0700dd3b..8bbb4025 100644 --- a/examples/vllm_llama.yaml +++ b/examples/vllm_llama.yaml @@ -14,9 +14,12 @@ launcher: backend: device: cuda - device_ids: 2 - no_weights: true + device_ids: 0 + no_weights: false + serving_mode: offline model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 + engine_args: + enforce_eager: true scenario: input_shapes: diff --git a/optimum_benchmark/backends/config.py b/optimum_benchmark/backends/config.py index ba09267b..8be19c3d 100644 --- a/optimum_benchmark/backends/config.py +++ b/optimum_benchmark/backends/config.py @@ -30,10 +30,10 @@ class BackendConfig(ABC): processor: Optional[str] = None device: Optional[str] = None - device_ids: Optional[str] = None - # yes we use a string here instead of a list + # we use a string here instead of a list # because it's easier to pass in a yaml or from cli # and it's consistent with GPU environment variables + device_ids: Optional[str] = None seed: int = 42 inter_op_num_threads: Optional[int] = None @@ -44,9 +44,6 @@ class BackendConfig(ABC): # processor kwargs that are added to its init method/constructor processor_kwargs: Dict[str, Any] = field(default_factory=dict) - # deprecated - hub_kwargs: Dict[str, Any] = field(default_factory=dict) - def __post_init__(self): if self.model is None: raise ValueError("`model` must be specified.") @@ -54,23 +51,16 @@ def __post_init__(self): if self.processor is None: self.processor = self.model - if self.hub_kwargs: - LOGGER.warning( - "`hub_kwargs` is deprecated and will be removed in future versions." 
- "Please use `model_kwargs` and `processor_kwargs` seperately." - ) - self.model_kwargs = {**self.model_kwargs, **self.hub_kwargs} - self.processor_kwargs = {**self.processor_kwargs, **self.hub_kwargs} - + # TODO: add cache_dir, token, etc. to these methods if self.task is None: - self.task = infer_task_from_model_name_or_path(self.model, self.hub_kwargs.get("revision", None)) + self.task = infer_task_from_model_name_or_path(self.model, self.model_kwargs.get("revision", None)) if self.library is None: - self.library = infer_library_from_model_name_or_path(self.model, self.hub_kwargs.get("revision", None)) + self.library = infer_library_from_model_name_or_path(self.model, self.model_kwargs.get("revision", None)) if self.model_type is None: self.model_type = infer_model_type_from_model_name_or_path( - self.model, self.hub_kwargs.get("revision", None) + self.model, self.model_kwargs.get("revision", None) ) if self.device is None: diff --git a/optimum_benchmark/backends/pytorch/config.py b/optimum_benchmark/backends/pytorch/config.py index a519fa2f..225718e5 100644 --- a/optimum_benchmark/backends/pytorch/config.py +++ b/optimum_benchmark/backends/pytorch/config.py @@ -54,6 +54,12 @@ class PyTorchConfig(BackendConfig): def __post_init__(self): super().__post_init__() + if self.model_kwargs.get("torch_dtype", None) is not None: + raise ValueError( + "`torch_dtype` is an explicit argument in the PyTorch backend config. " + "Please remove it from the `model_kwargs` and set it in the backend config directly." + ) + if self.device_map is not None and self.device_map not in DEVICE_MAPS: raise ValueError(f"`device_map` must be one of {DEVICE_MAPS}. Got {self.device_map} instead.") diff --git a/optimum_benchmark/backends/transformers_utils.py b/optimum_benchmark/backends/transformers_utils.py index 87755e78..3781da46 100644 --- a/optimum_benchmark/backends/transformers_utils.py +++ b/optimum_benchmark/backends/transformers_utils.py @@ -1,3 +1,4 @@ +import warnings from contextlib import contextmanager from typing import Any, Dict, Optional, Union @@ -107,7 +108,12 @@ def extract_transformers_shapes_from_artifacts( processor_dict = {k: v for k, v in processor.to_dict().items() if v is not None} artifacts_dict.update(processor_dict) elif processor is not None: - processor_dict = {k: getattr(processor, k) for k in dir(processor) if isinstance(getattr(processor, k), int)} + try: + processor_dict = { + k: getattr(processor, k) for k in dir(processor) if isinstance(getattr(processor, k), int) + } + except Exception: + warnings.warn(f"Could not extract shapes from processor {processor}") shapes = {} diff --git a/optimum_benchmark/backends/vllm/backend.py b/optimum_benchmark/backends/vllm/backend.py index 1a28de4a..e90f3e7e 100644 --- a/optimum_benchmark/backends/vllm/backend.py +++ b/optimum_benchmark/backends/vllm/backend.py @@ -1,11 +1,12 @@ +import asyncio import os from tempfile import TemporaryDirectory -from typing import Any, Dict +from typing import Any, Dict, Union import torch from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from safetensors.torch import save_file -from vllm import LLM, SamplingParams +from vllm import AsyncEngineArgs, AsyncLLMEngine, EngineArgs, LLMEngine, SamplingParams from ...task_utils import TEXT_GENERATION_TASKS from ..base import Backend @@ -15,6 +16,7 @@ class VLLMBackend(Backend[VLLMConfig]): NAME: str = "vllm" + pretrained_model: Union[LLMEngine, AsyncLLMEngine] def __init__(self, config: VLLMConfig) -> None: super().__init__(config) @@ -97,34 +99,10 @@ def 
load_model_with_no_weights(self) -> None: self.config.model = original_model def load_model_from_pretrained(self) -> None: - self.pretrained_model = LLM( - model=self.config.model, - # tokenizer - tokenizer=self.config.processor, - tokenizer_mode=self.config.tokenizer_mode, - skip_tokenizer_init=self.config.skip_tokenizer_init, - # device - device=self.config.device, - # parallelism - tensor_parallel_size=self.config.tensor_parallel_size, - # precision - quantization=self.config.quantization, - dtype=self.config.dtype, - # memory - swap_space=self.config.swap_space, - gpu_memory_utilization=self.config.gpu_memory_utilization, - # cuda graphs - enforce_eager=self.config.enforce_eager, - max_context_len_to_capture=self.config.max_context_len_to_capture, - max_seq_len_to_capture=self.config.max_seq_len_to_capture, - # kernels - disable_custom_all_reduce=self.config.disable_custom_all_reduce, - # additional stuff - trust_remote_code=self.config.model_kwargs.get("trust_remote_code", False), - tokenizer_revision=self.config.processor_kwargs.get("revision", None), - revision=self.config.model_kwargs.get("revision", None), - seed=self.config.seed, - ) + if self.config.serving_mode == "offline": + self.pretrained_model = LLMEngine.from_engine_args(EngineArgs(**self.config.to_engine_args())) + else: + self.pretrained_model = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(**self.config.to_engine_args())) def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: if self.config.task in TEXT_GENERATION_TASKS: @@ -134,11 +112,31 @@ def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: return inputs - def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> Any: - return self.pretrained_model.generate( - **inputs, - use_tqdm=False, - sampling_params=SamplingParams( + def batch_offline_engine_generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> Any: + for i, prompt in enumerate(inputs["prompts"]): + self.pretrained_model.add_request( + inputs=prompt, + request_id=str(i), + params=SamplingParams( + ignore_eos=True, + detokenize=True, + seed=self.config.seed, + n=kwargs.get("num_return_sequences"), + max_tokens=kwargs.get("max_new_tokens"), + min_tokens=kwargs.get("min_new_tokens"), + use_beam_search=kwargs.get("num_beams") > 1, + logits_processors=kwargs.get("logits_processors", None), + ), + ) + + while self.pretrained_model.has_unfinished_requests(): + self.pretrained_model.step() + + async def single_online_engine_generate(self, prompt: str, request_id: str, kwargs: Dict[str, Any]) -> Any: + stream = await self.pretrained_model.add_request( + inputs=prompt, + request_id=request_id, + params=SamplingParams( ignore_eos=True, detokenize=True, seed=self.config.seed, @@ -150,33 +148,23 @@ def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> Any: ), ) + async for _ in stream: + pass + + async def batch_online_engine_generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> Any: + tasks = [ + self.single_online_engine_generate(prompt, str(i), kwargs) for i, prompt in enumerate(inputs["prompts"]) + ] + await asyncio.gather(*tasks) + def prefill(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> Dict[str, Any]: - return self.pretrained_model.generate( - **inputs, - use_tqdm=False, - sampling_params=SamplingParams( - ignore_eos=True, - detokenize=True, - seed=self.config.seed, - n=kwargs.get("num_return_sequences"), - max_tokens=kwargs.get("max_new_tokens"), - min_tokens=kwargs.get("min_new_tokens"), - 
use_beam_search=kwargs.get("num_beams") > 1, - logits_processors=kwargs.get("logits_processors", None), - ), - ) + if self.config.serving_mode == "offline": + self.batch_offline_engine_generate(inputs, kwargs) + else: + asyncio.run(self.batch_online_engine_generate(inputs, kwargs)) def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> Any: - return self.pretrained_model.generate( - **inputs, - use_tqdm=False, - sampling_params=SamplingParams( - ignore_eos=True, - detokenize=True, - n=kwargs.get("num_return_sequences"), - max_tokens=kwargs.get("max_new_tokens"), - min_tokens=kwargs.get("min_new_tokens"), - use_beam_search=kwargs.get("num_beams") > 1, - logits_processors=kwargs.get("logits_processors", None), - ), - ) + if self.config.serving_mode == "offline": + self.batch_offline_engine_generate(inputs, kwargs) + else: + asyncio.run(self.batch_online_engine_generate(inputs, kwargs)) diff --git a/optimum_benchmark/backends/vllm/config.py b/optimum_benchmark/backends/vllm/config.py index 59cb859c..44bd9428 100644 --- a/optimum_benchmark/backends/vllm/config.py +++ b/optimum_benchmark/backends/vllm/config.py @@ -1,5 +1,5 @@ -from dataclasses import dataclass -from typing import Optional +from dataclasses import dataclass, field +from typing import Any, Dict, Optional from ...import_utils import vllm_version from ..config import BackendConfig @@ -11,36 +11,54 @@ class VLLMConfig(BackendConfig): version: Optional[str] = vllm_version() _target_: str = "optimum_benchmark.backends.vllm.backend.VLLMBackend" - # optimum-benchmark + # creates a model from scratch with dummy weights no_weights: bool = False - # tokenizer - tokenizer_mode: str = "auto" - skip_tokenizer_init: bool = False + # decides whether to use the offline or online llm engine + serving_mode: str = "online" - # parallelism - tensor_parallel_size: int = 1 + # passed to EngineArgs + engine_args: Dict[str, Any] = field(default_factory=dict) - # precision - dtype: str = "auto" - quantization: Optional[str] = None + def __post_init__(self): + # duplicates that are handled by the backend config directly + if "model" in self.engine_args: + raise ValueError("model should not be passed in `backend.engine_args`, use `backend.model` instead") - # cuda graphs - enforce_eager: bool = False - max_context_len_to_capture: Optional[int] = None - max_seq_len_to_capture: int = 8192 + if "tokenizer" in self.engine_args: + raise ValueError("tokenizer should not be passed in `backend.engine_args`, use `backend.processor` instead") - # kernels - disable_custom_all_reduce: bool = False + if "device" in self.engine_args: + raise ValueError("device should not be passed in `backend.engine_args`, use `backend.device` instead") - # memory - gpu_memory_utilization: float = 0.9 - swap_space: int = 4 + if self.serving_mode not in ["offline", "online"]: + raise ValueError(f"Invalid serving_mode: {self.serving_mode}. 
Must be 'online' or 'offline'.") + + # needed for task/library/model_type inference + self.model_kwargs = { + "revision": self.engine_args.get("revision", "main"), + "trust_remote_code": self.engine_args.get("trust_remote_code", False), + **self.model_kwargs, + } + self.processor_kwargs = { + "revision": self.engine_args.get("tokenizer_revision", "main"), + "trust_remote_code": self.engine_args.get("trust_remote_code", False), + **self.processor_kwargs, + } - def __post_init__(self): super().__post_init__() - self.device = self.device.lower() + if self.engine_args.get("disable_log_stats", None) is None: + self.engine_args["disable_log_stats"] = True + + if self.serving_mode == "online": + if self.engine_args.get("disable_log_requests", None) is None: + self.engine_args["disable_log_requests"] = True - if self.device not in ["cuda", "neuron", "cpu"]: - raise ValueError(f"VLLM Backend only supports 'cpu', 'cuda' and 'neuron' devices, got {self.device}") + def to_engine_args(self) -> Dict[str, Any]: + return dict( + model=self.model, + tokenizer=self.processor, + device=self.device, + **self.engine_args, + ) diff --git a/tests/configs/_serving_mode_.yaml b/tests/configs/_serving_mode_.yaml new file mode 100644 index 00000000..4b7523a0 --- /dev/null +++ b/tests/configs/_serving_mode_.yaml @@ -0,0 +1,5 @@ +hydra: + mode: MULTIRUN + sweeper: + params: + backend.serving_mode: online,offline diff --git a/tests/configs/cuda_inference_vllm_bloom.yaml b/tests/configs/cuda_inference_vllm_bloom.yaml index 9c1cb304..ba9d92af 100644 --- a/tests/configs/cuda_inference_vllm_bloom.yaml +++ b/tests/configs/cuda_inference_vllm_bloom.yaml @@ -3,6 +3,8 @@ defaults: - _base_ # inherits from base config - _cuda_ # inherits from cuda config - _inference_ # inherits from inference config + - _serving_mode_ # inherits from serving_mode config + - _no_weights_ # inherits from no weights config - _bloom_ # inherits from bloom config - _self_ # hydra 1.1 compatibility - override backend: vllm diff --git a/tests/test_cli.py b/tests/test_cli.py index 156ec5a1..806eedfa 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -8,6 +8,7 @@ LOGGER = getLogger("test") +FORCE_SERIAL = os.environ.get("FORCE_SERIAL", "0") == "1" TEST_CONFIG_DIR = "/".join(__file__.split("/")[:-1] + ["configs"]) TEST_CONFIG_NAMES = [ config.split(".")[0] @@ -24,12 +25,17 @@ def test_cli_configs(config_name): TEST_CONFIG_DIR, "--config-name", config_name, - # to run the tests faster (comment for debugging) + # to run the tests faster "hydra/launcher=joblib", "hydra.launcher.batch_size=1", "hydra.launcher.prefer=threads", ] + if FORCE_SERIAL: + args += ["hydra.launcher.n_jobs=1"] + else: + args += ["hydra.launcher.n_jobs=-1"] + popen = run_subprocess_and_log_stream_output(LOGGER, args) assert popen.returncode == 0, f"Failed to run {config_name}"
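
With the `serving_mode` switch introduced in PATCH 6/6, the vLLM backend is driven either by the synchronous `LLMEngine` (offline) or the asynchronous `AsyncLLMEngine` (online), and everything under `backend.engine_args` is forwarded to `EngineArgs`/`AsyncEngineArgs` via `to_engine_args()`. As a rough usage sketch (not part of the patch itself), a Hydra config exercising the online path could mirror `examples/vllm_llama.yaml`; the `defaults`, `launcher` and `scenario` sections are elided here, and the `gpu_memory_utilization` entry is only an illustrative assumption about available engine arguments:

    backend:
      device: cuda
      device_ids: 0
      no_weights: false
      serving_mode: online          # "offline" selects LLMEngine, "online" selects AsyncLLMEngine
      model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
      engine_args:
        enforce_eager: true         # forwarded verbatim to EngineArgs/AsyncEngineArgs
        gpu_memory_utilization: 0.9 # assumed engine argument, not set in the patch's example

Note that `model`, `tokenizer` and `device` must stay at the `backend` level; the config's `__post_init__` rejects them inside `engine_args`.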