From 2a75c0bc0d007cc875fa0f75ca41d02e46f917be Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Wed, 3 Jul 2024 16:05:34 +0200 Subject: [PATCH 1/6] Fix per token latency (#223) --- optimum_benchmark/trackers/latency.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/optimum_benchmark/trackers/latency.py b/optimum_benchmark/trackers/latency.py index 340fcc61..1f74a377 100644 --- a/optimum_benchmark/trackers/latency.py +++ b/optimum_benchmark/trackers/latency.py @@ -262,7 +262,7 @@ def __init__(self, device: str, backend: str): LOGGER.info("\t+ Tracking latency using CPU performance counter") self.start_time: Optional[float] = None - self.next_is_prefill_end_decode_start: Optional[bool] = None + self.prefilled: Optional[bool] = None self.per_token_events: List[Union[float, torch.cuda.Event]] = [] self.prefill_start_events: List[Union[float, torch.cuda.Event]] = [] @@ -272,7 +272,7 @@ def __init__(self, device: str, backend: str): def reset(self): self.start_time = None - self.next_is_prefill_end_decode_start = None + self.prefilled = None self.per_token_events = [] self.prefill_start_events = [] @@ -291,11 +291,13 @@ def track(self): else: self.prefill_start_events.append(time.perf_counter()) - self.next_is_prefill_end_decode_start = True # this is used to record the end of prefill and start of decode + self.prefilled = False - yield # this is where generate is called, and for each decoded token, we record an event + # this is where generate is called, + # and for each decoded token, we record an event + yield - self.next_is_prefill_end_decode_start = None + self.prefilled = None if self.is_asynchronous: self.decode_end_events.append(torch.cuda.Event(enable_timing=True)) @@ -308,7 +310,7 @@ def track(self): def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor): assert ( - self.next_is_prefill_end_decode_start is not None + self.prefilled is not None ), "PerTokenLatencyLogitsProcessor should only be called inside of track() context" if self.is_asynchronous: @@ -317,12 +319,12 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor): else: event = time.perf_counter() - if self.next_is_prefill_end_decode_start: + self.per_token_events.append(event) + + if not self.prefilled: self.prefill_end_events.append(event) self.decode_start_events.append(event) - self.next_is_prefill_end_decode_start = False - else: - self.per_token_events.append(event) + self.prefilled = True return scores From 8ebe8531a5b12ea7926cab66905381c76b24fcc4 Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Wed, 3 Jul 2024 16:45:43 +0200 Subject: [PATCH 2/6] Patch release (#224) --- optimum_benchmark/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum_benchmark/version.py b/optimum_benchmark/version.py index e1107dcd..d4044aeb 100644 --- a/optimum_benchmark/version.py +++ b/optimum_benchmark/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "0.3.0" +__version__ = "0.3.1" From 79990507b694d513bac81e140baff3af23a6bff7 Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Wed, 3 Jul 2024 17:28:10 +0200 Subject: [PATCH 3/6] Per token latency outliers (#225) --- optimum_benchmark/import_utils.py | 12 ++++++------ optimum_benchmark/trackers/latency.py | 25 ++++++++++++++----------- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/optimum_benchmark/import_utils.py b/optimum_benchmark/import_utils.py index 09a2a08d..7fdff6ef 100644 --- a/optimum_benchmark/import_utils.py +++ b/optimum_benchmark/import_utils.py @@ -246,16 +246,16 @@ def get_hf_libs_info(): return { "optimum_benchmark_version": optimum_benchmark_version(), "optimum_benchmark_commit": get_git_revision_hash("optimum_benchmark"), - "transformers_version": transformers_version(), + "transformers_version": transformers_version() if is_transformers_available() else None, "transformers_commit": get_git_revision_hash("transformers"), - "accelerate_version": accelerate_version(), + "accelerate_version": accelerate_version() if is_accelerate_available else None, "accelerate_commit": get_git_revision_hash("accelerate"), - "diffusers_version": diffusers_version(), + "diffusers_version": diffusers_version() if is_diffusers_available() else None, "diffusers_commit": get_git_revision_hash("diffusers"), - "optimum_version": optimum_version(), + "optimum_version": optimum_version() if is_optimum_available() else None, "optimum_commit": get_git_revision_hash("optimum"), - "timm_version": timm_version(), + "timm_version": timm_version() if is_timm_available() else None, "timm_commit": get_git_revision_hash("timm"), - "peft_version": peft_version(), + "peft_version": peft_version() if is_peft_available() else None, "peft_commit": get_git_revision_hash("peft"), } diff --git a/optimum_benchmark/trackers/latency.py b/optimum_benchmark/trackers/latency.py index 1f74a377..6b8d614f 100644 --- a/optimum_benchmark/trackers/latency.py +++ b/optimum_benchmark/trackers/latency.py @@ -264,7 +264,7 @@ def __init__(self, device: str, backend: str): self.start_time: Optional[float] = None self.prefilled: Optional[bool] = None - self.per_token_events: List[Union[float, torch.cuda.Event]] = [] + self.per_token_events: List[List[Union[float, torch.cuda.Event]]] = [] self.prefill_start_events: List[Union[float, torch.cuda.Event]] = [] self.prefill_end_events: List[Union[float, torch.cuda.Event]] = [] self.decode_start_events: List[Union[float, torch.cuda.Event]] = [] @@ -282,6 +282,9 @@ def reset(self): @contextmanager def track(self): + self.prefilled = False + self.per_token_events.append([]) + if self.is_distributed: torch.distributed.barrier() @@ -291,14 +294,10 @@ def track(self): else: self.prefill_start_events.append(time.perf_counter()) - self.prefilled = False - # this is where generate is called, # and for each decoded token, we record an event yield - self.prefilled = None - if self.is_asynchronous: self.decode_end_events.append(torch.cuda.Event(enable_timing=True)) self.decode_end_events[-1].record() @@ -308,6 +307,8 @@ def track(self): if self.is_distributed: torch.distributed.barrier() + self.prefilled = False + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor): assert ( self.prefilled is not None @@ -319,13 +320,13 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor): else: event = time.perf_counter() - self.per_token_events.append(event) - if not 
self.prefilled: self.prefill_end_events.append(event) self.decode_start_events.append(event) self.prefilled = True + self.per_token_events[-1].append(event) + return scores def get_prefill_latency(self) -> Latency: @@ -368,13 +369,15 @@ def get_per_token_latency(self) -> Latency: torch.cuda.synchronize() latencies_list = [ - self.per_token_events[i].elapsed_time(self.per_token_events[i + 1]) / 1e3 - for i in range(0, len(self.per_token_events) - 1) + self.per_token_events[i][j].elapsed_time(self.per_token_events[i][j + 1]) / 1e3 + for i in range(len(self.per_token_events)) + for j in range(0, len(self.per_token_events[i]) - 1) ] else: latencies_list = [ - (self.per_token_events[i + 1] - self.per_token_events[i]) - for i in range(0, len(self.per_token_events) - 1) + (self.per_token_events[i][j + 1] - self.per_token_events[i][j]) + for i in range(len(self.per_token_events)) + for j in range(0, len(self.per_token_events[i]) - 1) ] assert not any(latency < 0 for latency in latencies_list), "Negative latency detected" From e291e9b18cfb323457af60f15f2bb33803718668 Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Mon, 15 Jul 2024 16:51:54 +0200 Subject: [PATCH 4/6] Refactor backends and add `load` tracking (#227) --- examples/pytorch_llama.py | 63 ++++ ...orch_llama_awq.yaml => pytorch_llama.yaml} | 23 +- examples/pytorch_llama_awq.py | 28 -- optimum_benchmark/backends/base.py | 65 ++-- optimum_benchmark/backends/config.py | 12 +- optimum_benchmark/backends/diffusers_utils.py | 39 ++- .../backends/llm_swarm/backend.py | 21 +- .../backends/neural_compressor/backend.py | 71 ++-- .../backends/onnxruntime/backend.py | 142 ++++---- .../backends/onnxruntime/utils.py | 20 +- .../backends/openvino/backend.py | 162 +++++---- optimum_benchmark/backends/openvino/utils.py | 23 +- optimum_benchmark/backends/py_txi/backend.py | 28 +- optimum_benchmark/backends/pytorch/backend.py | 318 ++++++++++-------- .../backends/tensorrt_llm/backend.py | 33 +- optimum_benchmark/backends/timm_utils.py | 13 +- .../backends/torch_ort/backend.py | 41 +-- .../backends/transformers_utils.py | 78 ++++- optimum_benchmark/backends/vllm/backend.py | 43 ++- .../launchers/process/launcher.py | 9 +- .../scenarios/energy_star/scenario.py | 11 +- .../scenarios/inference/config.py | 16 +- .../scenarios/inference/scenario.py | 104 ++++-- .../scenarios/training/scenario.py | 27 +- optimum_benchmark/task_utils.py | 268 +++++++-------- optimum_benchmark/trackers/latency.py | 19 +- tests/configs/_bert_.yaml | 1 + tests/configs/_diffusers_.yaml | 2 +- tests/configs/cuda_inference_py_txi_bert.yaml | 2 +- tests/test_api.py | 2 +- 30 files changed, 940 insertions(+), 744 deletions(-) create mode 100644 examples/pytorch_llama.py rename examples/{pytorch_llama_awq.yaml => pytorch_llama.yaml} (51%) delete mode 100644 examples/pytorch_llama_awq.py diff --git a/examples/pytorch_llama.py b/examples/pytorch_llama.py new file mode 100644 index 00000000..5ecf5573 --- /dev/null +++ b/examples/pytorch_llama.py @@ -0,0 +1,63 @@ +import os + +from optimum_benchmark import Benchmark, BenchmarkConfig, InferenceConfig, ProcessConfig, PyTorchConfig +from optimum_benchmark.logging_utils import setup_logging + +BENCHMARK_NAME = "pytorch-llama" + +WEIGHTS_CONFIGS = { + "float16": { + "torch_dtype": "float16", + "quantization_scheme": None, + "quantization_config": {}, + }, + # "4bit-awq-gemm": { + # "torch_dtype": "float16", + # "quantization_scheme": "awq", + # "quantization_config": {"bits": 4, "version": 
"gemm"}, + # }, + # "4bit-gptq-exllama-v2": { + # "torch_dtype": "float16", + # "quantization_scheme": "gptq", + # "quantization_config": {"bits": 4, "use_exllama ": True, "version": 2, "model_seqlen": 256}, + # }, +} + + +def run_benchmark(weight_config: str): + launcher_config = ProcessConfig(device_isolation=True, device_isolation_action="warn") + backend_config = PyTorchConfig( + device="cuda", + device_ids="0", + no_weights=True, + model="gpt2", + **WEIGHTS_CONFIGS[weight_config], + ) + scenario_config = InferenceConfig( + memory=True, + latency=True, + duration=10, + iterations=10, + warmup_runs=10, + input_shapes={"batch_size": 1, "sequence_length": 128}, + generate_kwargs={"max_new_tokens": 32, "min_new_tokens": 32}, + ) + + benchmark_config = BenchmarkConfig( + name=BENCHMARK_NAME, launcher=launcher_config, scenario=scenario_config, backend=backend_config + ) + benchmark_report = Benchmark.launch(benchmark_config) + benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + + filename = f"{BENCHMARK_NAME}-{backend_config.version}-{weight_config}.json" + benchmark.push_to_hub(repo_id="optimum-benchmark/pytorch-llama", filename=filename) + benchmark.save_json(path=f"benchmarks/{filename}") + + +if __name__ == "__main__": + level = os.environ.get("LOG_LEVEL", "INFO") + to_file = os.environ.get("LOG_TO_FILE", "0") == "1" + setup_logging(level=level, to_file=to_file, prefix="MAIN-PROCESS") + + for weight_config in WEIGHTS_CONFIGS: + run_benchmark(weight_config) diff --git a/examples/pytorch_llama_awq.yaml b/examples/pytorch_llama.yaml similarity index 51% rename from examples/pytorch_llama_awq.yaml rename to examples/pytorch_llama.yaml index 34c8e957..becd1f2e 100644 --- a/examples/pytorch_llama_awq.yaml +++ b/examples/pytorch_llama.yaml @@ -3,24 +3,31 @@ defaults: - scenario: inference - launcher: process - backend: pytorch + - _base_ - _self_ -experiment_name: pytorch_llama_awq +name: pytorch_llama launcher: device_isolation: true device_isolation_action: warn backend: + model: gpt2 device: cuda - device_ids: 0 - no_weights: true - model: TheBloke/Llama-2-70B-AWQ + torch_dtype: float16 + +scenario: + memory: true + latency: true + + warmup_runs: 10 + iterations: 10 + duration: 10 -benchmark: input_shapes: batch_size: 1 - sequence_length: 128 + sequence_length: 256 generate_kwargs: - max_new_tokens: 100 - min_new_tokens: 100 + max_new_tokens: 32 + min_new_tokens: 32 diff --git a/examples/pytorch_llama_awq.py b/examples/pytorch_llama_awq.py deleted file mode 100644 index 96e100a9..00000000 --- a/examples/pytorch_llama_awq.py +++ /dev/null @@ -1,28 +0,0 @@ -from optimum_benchmark import Benchmark, BenchmarkConfig, InferenceConfig, ProcessConfig, PyTorchConfig -from optimum_benchmark.logging_utils import setup_logging - -setup_logging(level="INFO", prefix="MAIN-PROCESS") - -if __name__ == "__main__": - BENCHMARK_NAME = "pytorch_llama_awq" - REPO_ID = f"IlyasMoutawwakil/{BENCHMARK_NAME}" - - scenario_config = InferenceConfig( - memory=True, - latency=True, - input_shapes={"batch_size": 1, "sequence_length": 128}, - generate_kwargs={"max_new_tokens": 100, "min_new_tokens": 100}, - ) - launcher_config = ProcessConfig(device_isolation=True, device_isolation_action="warn") - backend_config = PyTorchConfig(device="cuda", device_ids="0", no_weights=True, model="TheBloke/Llama-2-70B-AWQ") - - benchmark_config = BenchmarkConfig( - name=BENCHMARK_NAME, launcher=launcher_config, scenario=scenario_config, backend=backend_config - ) - # benchmark_config.push_to_hub(repo_id=REPO_ID) - - 
benchmark_report = Benchmark.launch(benchmark_config) - # benchmark_report.push_to_hub(repo_id=REPO_ID) - - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - # benchmark.push_to_hub(repo_id=REPO_ID) diff --git a/optimum_benchmark/backends/base.py b/optimum_benchmark/backends/base.py index 8e6b9f89..8ae7e2cf 100644 --- a/optimum_benchmark/backends/base.py +++ b/optimum_benchmark/backends/base.py @@ -1,24 +1,34 @@ +import os from abc import ABC from collections import OrderedDict from logging import getLogger -from typing import Any, ClassVar, Dict, Generic, Optional, Tuple +from typing import Any, ClassVar, Dict, Generic, Optional import datasets.utils.logging as datasets_logging import transformers.utils.logging as transformers_logging +from safetensors.torch import save_file from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel, TrainerState, set_seed -from ..task_utils import get_automodel_class_for_task +from ..import_utils import is_torch_available from .config import BackendConfigT -from .diffusers_utils import extract_diffusers_shapes_from_model, get_diffusers_pretrained_config -from .timm_utils import extract_timm_shapes_from_config, get_timm_pretrained_config +from .diffusers_utils import ( + extract_diffusers_shapes_from_model, + get_diffusers_automodel_loader_for_task, + get_diffusers_pretrained_config, +) +from .timm_utils import extract_timm_shapes_from_config, get_timm_automodel_loader, get_timm_pretrained_config from .transformers_utils import ( PretrainedProcessor, extract_transformers_shapes_from_artifacts, + get_transformers_automodel_loader_for_task, get_transformers_generation_config, get_transformers_pretrained_config, get_transformers_pretrained_processor, ) +if is_torch_available(): + import torch + datasets_logging.set_verbosity_error() transformers_logging.set_verbosity_error() @@ -47,7 +57,7 @@ def __init__(self, config: BackendConfigT): self.logger.info("\t+ Benchmarking a Diffusers pipeline") self.pretrained_config = get_diffusers_pretrained_config(self.config.model, **self.config.model_kwargs) self.model_shapes = extract_diffusers_shapes_from_model(self.config.model, **self.config.model_kwargs) - self.model_type = self.config.task + self.automodel_loader = get_diffusers_automodel_loader_for_task(self.config.task) self.pretrained_processor = None self.generation_config = None @@ -55,7 +65,7 @@ def __init__(self, config: BackendConfigT): self.logger.info("\t+ Benchmarking a Timm model") self.pretrained_config = get_timm_pretrained_config(self.config.model) self.model_shapes = extract_timm_shapes_from_config(self.pretrained_config) - self.model_type = self.pretrained_config.architecture + self.automodel_loader = get_timm_automodel_loader() self.pretrained_processor = None self.generation_config = None @@ -69,31 +79,42 @@ def __init__(self, config: BackendConfigT): self.model_shapes = extract_transformers_shapes_from_artifacts( self.pretrained_config, self.pretrained_processor ) - self.model_type = self.pretrained_config.model_type - - self.automodel_class = get_automodel_class_for_task( - model_type=self.model_type, library=self.config.library, task=self.config.task, framework="pt" - ) - self.logger.info(f"\t+ Using automodel class {self.automodel_class.__name__}") + self.automodel_loader = get_transformers_automodel_loader_for_task(self.config.task) def seed(self) -> None: set_seed(self.config.seed) - def prepare_for_inference(self, **kwargs) -> None: + def create_no_weights_model(self) -> None: + if 
self.pretrained_config is None: + raise ValueError("Can't create no weights model without a pretrained config") + + self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") + self.logger.info("\t+ Creating no weights model's directory") + os.makedirs(self.no_weights_model, exist_ok=True) + self.logger.info("\t+ Creating no weights model's state dict") + state_dict = torch.nn.Linear(1, 1).state_dict() + self.logger.info("\t+ Saving no weights model's safetensors") + safetensors = os.path.join(self.no_weights_model, "model.safetensors") + save_file(tensors=state_dict, filename=safetensors, metadata={"format": "pt"}) + self.logger.info("\t+ Saving no weights model's config") + self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) + + def prepare_input_shapes(self, input_shapes: Dict[str, Any]) -> Dict[str, Any]: """ - This method is used to prepare the model for inference. - It can be used to compile the model with certain input/output shapes, for example. + This method is used to prepare and register the input shapes before using them by the model. + It can be used to pad the inputs to the correct shape, or compile it to the correct format. """ - pass + return input_shapes - def prepare_inputs( - self, inputs: Dict[str, Any], input_shapes: Dict[str, Any] - ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: """ - This method is used to prepare the inputs before passing them to the model. - It can be used to move the inputs to the correct device, for example. + This method is used to prepare and register the inputs before passing them to the model. + It can be used to move the inputs to the correct device, or rename their keys. """ - return inputs, input_shapes + return inputs + + def load(self) -> None: + raise NotImplementedError("Backend must implement load method") def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: """ diff --git a/optimum_benchmark/backends/config.py b/optimum_benchmark/backends/config.py index 9113d717..ba09267b 100644 --- a/optimum_benchmark/backends/config.py +++ b/optimum_benchmark/backends/config.py @@ -7,7 +7,11 @@ from psutil import cpu_count from ..system_utils import get_gpu_device_ids, is_nvidia_system, is_rocm_system -from ..task_utils import infer_library_from_model_name_or_path, infer_task_from_model_name_or_path +from ..task_utils import ( + infer_library_from_model_name_or_path, + infer_model_type_from_model_name_or_path, + infer_task_from_model_name_or_path, +) LOGGER = getLogger("backend") @@ -20,6 +24,7 @@ class BackendConfig(ABC): task: Optional[str] = None library: Optional[str] = None + model_type: Optional[str] = None model: Optional[str] = None processor: Optional[str] = None @@ -63,6 +68,11 @@ def __post_init__(self): if self.library is None: self.library = infer_library_from_model_name_or_path(self.model, self.hub_kwargs.get("revision", None)) + if self.model_type is None: + self.model_type = infer_model_type_from_model_name_or_path( + self.model, self.hub_kwargs.get("revision", None) + ) + if self.device is None: self.device = "cuda" if is_nvidia_system() or is_rocm_system() else "cpu" diff --git a/optimum_benchmark/backends/diffusers_utils.py b/optimum_benchmark/backends/diffusers_utils.py index a9b5b5a7..ef6f9376 100644 --- a/optimum_benchmark/backends/diffusers_utils.py +++ b/optimum_benchmark/backends/diffusers_utils.py @@ -5,11 +5,40 @@ from ..import_utils import is_diffusers_available if 
is_diffusers_available(): - import diffusers # type: ignore + import diffusers + from diffusers import DiffusionPipeline + + if hasattr(diffusers, "pipelines") and hasattr(diffusers.pipelines, "auto_pipeline"): + from diffusers.pipelines.auto_pipeline import ( + AUTO_IMAGE2IMAGE_PIPELINES_MAPPING, + AUTO_INPAINT_PIPELINES_MAPPING, + AUTO_TEXT2IMAGE_PIPELINES_MAPPING, + ) + + TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES = { + "inpainting": AUTO_INPAINT_PIPELINES_MAPPING.copy(), + "text-to-image": AUTO_TEXT2IMAGE_PIPELINES_MAPPING.copy(), + "image-to-image": AUTO_IMAGE2IMAGE_PIPELINES_MAPPING.copy(), + } + + for task_name, model_mapping in TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES.items(): + for model_type, model_class in model_mapping.items(): + TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES[task_name][model_type] = model_class.__name__ + else: + TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES = {} +else: + TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES = {} + + +TASKS_TO_MODEL_LOADERS = { + "inpainting": "AutoPipelineForInpainting", + "text-to-image": "AutoPipelineForText2Image", + "image-to-image": "AutoPipelineForImage2Image", +} def get_diffusers_pretrained_config(model: str, **kwargs) -> Dict[str, int]: - return diffusers.DiffusionPipeline.load_config(model, **kwargs) + return DiffusionPipeline.load_config(model, **kwargs) def extract_diffusers_shapes_from_model(model: str, **kwargs) -> Dict[str, int]: @@ -38,3 +67,9 @@ def extract_diffusers_shapes_from_model(model: str, **kwargs) -> Dict[str, int]: shapes["width"] = -1 return shapes + + +def get_diffusers_automodel_loader_for_task(task: str): + model_loader_name = TASKS_TO_MODEL_LOADERS[task] + model_loader_class = getattr(diffusers, model_loader_name) + return model_loader_class diff --git a/optimum_benchmark/backends/llm_swarm/backend.py b/optimum_benchmark/backends/llm_swarm/backend.py index dd08b9b7..8139e4ea 100644 --- a/optimum_benchmark/backends/llm_swarm/backend.py +++ b/optimum_benchmark/backends/llm_swarm/backend.py @@ -1,5 +1,5 @@ import asyncio -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, List import torch from huggingface_hub import AsyncInferenceClient @@ -16,8 +16,11 @@ class LLMSwarmBackend(Backend[LLMSwarmConfig]): def __init__(self, config: LLMSwarmConfig) -> None: super().__init__(config) - self.validate_task() + if self.config.task not in TEXT_GENERATION_TASKS: + raise NotImplementedError(f"LLM Swarm does not support task {self.config.task}") + + def load(self) -> None: self.logger.info("\t+ Downloading pretrained model") self.download_pretrained_model() self.logger.info("\t+ Preparing generation config") @@ -25,10 +28,6 @@ def __init__(self, config: LLMSwarmConfig) -> None: self.logger.info("\t+ Loading pretrained model") self.load_model_from_pretrained() - def validate_task(self) -> None: - if self.config.task not in TEXT_GENERATION_TASKS: - raise NotImplementedError(f"LLM Swarm does not support task {self.config.task}") - def load_model_from_pretrained(self) -> None: self.llm_swarm_config = LLMSwarmCfg( gpus=self.config.gpus, @@ -46,7 +45,7 @@ def load_model_from_pretrained(self) -> None: def download_pretrained_model(self) -> None: with torch.device("meta"): - self.automodel_class.from_pretrained(self.config.model, **self.config.model_kwargs) + self.auto_model_loader.from_pretrained(self.config.model, **self.config.model_kwargs) def prepare_generation_config(self) -> None: self.generation_config.eos_token_id = -100 @@ -60,11 +59,7 @@ def prepare_generation_config(self) -> None: self.logger.info("\t+ Saving new pretrained 
generation config") self.generation_config.save_pretrained(save_directory=model_snapshot_path) - def prepare_inputs( - self, inputs: Dict[str, Any], input_shapes: Dict[str, Any] - ) -> Tuple[Dict[str, Any], Dict[str, Any]]: - inputs, input_shapes = super().prepare_inputs(inputs, input_shapes) - + def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: if "inputs" in inputs: inputs = {"prompt": self.pretrained_processor.batch_decode(inputs["inputs"].tolist())} elif "input_ids" in inputs: @@ -72,7 +67,7 @@ def prepare_inputs( else: raise ValueError("inputs must contain either input_ids or inputs") - return inputs, input_shapes + return inputs async def single_client_call(self, prompt: str, kwargs: Dict[str, Any]) -> str: return await self.client.text_generation(prompt, max_new_tokens=kwargs.get("max_new_tokens", 1)) diff --git a/optimum_benchmark/backends/neural_compressor/backend.py b/optimum_benchmark/backends/neural_compressor/backend.py index 7ee43635..c180a5ba 100644 --- a/optimum_benchmark/backends/neural_compressor/backend.py +++ b/optimum_benchmark/backends/neural_compressor/backend.py @@ -10,7 +10,7 @@ from ...generators.dataset_generator import DatasetGenerator from ..base import Backend -from ..transformers_utils import random_init_weights +from ..transformers_utils import fast_weights_init from .config import INCConfig from .utils import TASKS_TO_INCMODELS @@ -20,89 +20,82 @@ class INCBackend(Backend[INCConfig]): def __init__(self, config: INCConfig): super().__init__(config) - self.validate_task() + if self.config.task in TASKS_TO_INCMODELS: + self.incmodel_class = get_class(TASKS_TO_INCMODELS[self.config.task]) + self.logger.info(f"Using INCModel class {self.incmodel_class.__name__}") + else: + raise NotImplementedError(f"INCBackend does not support task {self.config.task}") + + def load(self) -> None: self.logger.info("\t+ Creating backend temporary directory") self.tmpdir = TemporaryDirectory() if self.config.ptq_quantization: if self.config.no_weights: + self.logger.info("\t+ Creating no weights AutoModel") + self.create_no_weights_model() self.logger.info("\t+ Loading no weights AutoModel") self.load_automodel_with_no_weights() else: self.logger.info("\t+ Loading pretrained AutoModel") self.load_automodel_from_pretrained() - self.logger.info("\t+ Applying post-training quantization") self.quantize_automodel() - self.logger.info("\t+ Loading quantized INCModel") original_model, self.config.model = self.config.model, self.quantized_model self.load_incmodel_from_pretrained() self.config.model = original_model - elif self.config.no_weights: + self.logger.info("\t+ Creating no weights INCModel") + self.create_no_weights_model() self.logger.info("\t+ Loading no weights INCModel") self.load_incmodel_with_no_weights() - else: self.logger.info("\t+ Loading pretrained INCModel") self.load_incmodel_from_pretrained() self.tmpdir.cleanup() - def validate_task(self) -> None: - if self.config.task not in TASKS_TO_INCMODELS: - raise NotImplementedError(f"INCBackend does not support task {self.config.task}") - - self.incmodel_class = get_class(TASKS_TO_INCMODELS[self.config.task]) - self.logger.info(f"Using INCModel class {self.incmodel_class.__name__}") - def load_automodel_from_pretrained(self) -> None: - self.pretrained_model = self.automodel_class.from_pretrained(self.config.model, **self.config.model_kwargs) - - def create_no_weights_model(self) -> None: - self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") - self.logger.info("\t+ Creating no weights 
model directory") - os.makedirs(self.no_weights_model, exist_ok=True) - self.logger.info("\t+ Creating no weights model state dict") - state_dict = torch.nn.Linear(1, 1).state_dict() - self.logger.info("\t+ Saving no weights model pytorch_model.bin") - torch.save(state_dict, os.path.join(self.no_weights_model, "pytorch_model.bin")) - - if self.config.library == "transformers": - self.logger.info("\t+ Saving no weights model pretrained config") - self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) + self.pretrained_model = self.automodel_loader.from_pretrained(self.config.model, **self.config.model_kwargs) def load_automodel_with_no_weights(self) -> None: - self.logger.info("\t+ Creating no weights model") - self.create_no_weights_model() + original_model, self.config.model = self.config.model, self.no_weights_model - with random_init_weights(): - original_model, self.config.model = self.config.model, self.no_weights_model - self.logger.info("\t+ Loading no weights AutoModel") + with fast_weights_init(): self.load_automodel_from_pretrained() - self.config.model = original_model self.logger.info("\t+ Tying model weights") self.pretrained_model.tie_weights() + self.config.model = original_model + def load_incmodel_from_pretrained(self) -> None: self.pretrained_model = self.incmodel_class.from_pretrained(self.config.model, **self.config.model_kwargs) def load_incmodel_with_no_weights(self) -> None: - self.logger.info("\t+ Creating no weights model") - self.create_no_weights_model() + original_model, self.config.model = self.config.model, self.no_weights_model - with random_init_weights(): - original_model, self.config.model = self.config.model, self.no_weights_model - self.logger.info("\t+ Loading no weights INCModel") + with fast_weights_init(): self.load_incmodel_from_pretrained() - self.config.model = original_model self.logger.info("\t+ Tying model weights") self.pretrained_model.model.tie_weights() + self.config.model = original_model + + def create_no_weights_model(self) -> None: + self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") + self.logger.info("\t+ Creating no weights model directory") + os.makedirs(self.no_weights_model, exist_ok=True) + self.logger.info("\t+ Creating no weights model state dict") + state_dict = torch.nn.Linear(1, 1).state_dict() + self.logger.info("\t+ Saving no weights model pytorch_model.bin") + torch.save(state_dict, os.path.join(self.no_weights_model, "pytorch_model.bin")) + self.logger.info("\t+ Saving no weights model pretrained config") + self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) + def quantize_automodel(self) -> None: self.quantized_model = f"{self.tmpdir.name}/quantized_model" self.logger.info("\t+ Processing quantization config") diff --git a/optimum_benchmark/backends/onnxruntime/backend.py b/optimum_benchmark/backends/onnxruntime/backend.py index de8c52ac..8fb69254 100644 --- a/optimum_benchmark/backends/onnxruntime/backend.py +++ b/optimum_benchmark/backends/onnxruntime/backend.py @@ -1,12 +1,17 @@ import os from collections import OrderedDict from tempfile import TemporaryDirectory -from typing import Any, Dict, List, Tuple +from typing import Any, Dict import torch from hydra.utils import get_class from onnxruntime import SessionOptions -from optimum.onnxruntime import ONNX_DECODER_NAME, ONNX_DECODER_WITH_PAST_NAME, ORTOptimizer, ORTQuantizer +from optimum.onnxruntime import ( + ONNX_DECODER_NAME, + ONNX_DECODER_WITH_PAST_NAME, + ORTOptimizer, + ORTQuantizer, +) 
from optimum.onnxruntime.configuration import ( AutoCalibrationConfig, AutoOptimizationConfig, @@ -15,15 +20,19 @@ OptimizationConfig, QuantizationConfig, ) -from safetensors.torch import save_file from ...generators.dataset_generator import DatasetGenerator from ...import_utils import is_accelerate_available, is_torch_distributed_available from ...task_utils import TEXT_GENERATION_TASKS from ..base import Backend -from ..transformers_utils import random_init_weights +from ..transformers_utils import fast_weights_init from .config import ORTConfig -from .utils import TASKS_TO_ORTMODELS, TASKS_TO_ORTSD, format_calibration_config, format_quantization_config +from .utils import ( + TASKS_TO_MODEL_TYPES_TO_ORTPIPELINES, + TASKS_TO_ORTMODELS, + format_calibration_config, + format_quantization_config, +) if is_accelerate_available(): from accelerate import Accelerator @@ -37,7 +46,22 @@ class ORTBackend(Backend[ORTConfig]): def __init__(self, config: ORTConfig) -> None: super().__init__(config) - self.validate_task() + + if self.config.task in TASKS_TO_ORTMODELS: + self.ort_model_loader = get_class(TASKS_TO_ORTMODELS[self.config.task]) + self.logger.info(f"Using ORT Model class {self.ort_model_loader.__name__}") + elif self.config.task in TASKS_TO_MODEL_TYPES_TO_ORTPIPELINES: + if self.config.model_type in TASKS_TO_MODEL_TYPES_TO_ORTPIPELINES[self.config.task]: + self.ort_model_loader = get_class( + TASKS_TO_MODEL_TYPES_TO_ORTPIPELINES[self.config.task][self.config.model_type] + ) + self.logger.info(f"Using ORT Pipeline class {self.ort_model_loader.__name__}") + else: + raise NotImplementedError( + f"ORTBackend does not support model {self.config.model_type} for task {self.config.task}" + ) + else: + raise NotImplementedError(f"ORTBackend does not support task {self.config.task}") self.session_options = SessionOptions() if self.config.session_options: @@ -45,10 +69,19 @@ def __init__(self, config: ORTConfig) -> None: for key, value in self.config.session_options.items(): setattr(self.session_options, key, value) + def validate_execution_provider(self) -> None: + if not self.pretrained_model.providers[0] == self.config.provider: + raise ValueError( + f"{self.config.provider} is not first in providers list: {self.pretrained_model.providers}" + ) + + def load(self) -> None: self.logger.info("\t+ Creating backend temporary directory") self.tmpdir = TemporaryDirectory() if self.config.no_weights: + self.logger.info("\t+ Creating no weights ORTModel") + self.create_no_weights_model() self.logger.info("\t+ Loading no weights ORTModel") self.load_ortmodel_with_no_weights() else: @@ -70,55 +103,19 @@ def __init__(self, config: ORTConfig) -> None: if self.is_optimized or self.is_quantized: original_export, self.config.export = self.config.export, False - self.logger.info("\t+ Loading optimized/quantized ORTModel") + self.logger.info("\t+ Loading optimized/quantized model") self.load_ortmodel_from_pretrained() - self.config.model, self.config.export = original_model, original_export - - self.validate_provider() - self.tmpdir.cleanup() - - def validate_task(self) -> None: - if self.config.task in TASKS_TO_ORTSD: - self.ortmodel_class = get_class(TASKS_TO_ORTSD[self.config.task]) - self.logger.info(f"Using ORTStableDiffusion class {self.ortmodel_class.__name__}") - elif self.config.task in TASKS_TO_ORTMODELS: - self.ortmodel_class = get_class(TASKS_TO_ORTMODELS[self.config.task]) - self.logger.info(f"Using ORTModel class {self.ortmodel_class.__name__}") - else: - raise NotImplementedError(f"ORTBackend does 
not support task {self.config.task}") - - def validate_provider(self) -> None: - if not self.pretrained_model.providers[0] == self.config.provider: - raise ValueError( - f"{self.config.provider} is not first in providers list: {self.pretrained_model.providers}" - ) - - def create_no_weights_model(self) -> None: - self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") - self.logger.info("\t+ Creating no weights model directory") - os.makedirs(self.no_weights_model, exist_ok=True) - self.logger.info("\t+ Creating no weights model state dict") - state_dict = torch.nn.Linear(1, 1).state_dict() - self.logger.info("\t+ Saving no weights model safetensors") - safetensors = os.path.join(self.no_weights_model, "model.safetensors") - save_file(tensors=state_dict, filename=safetensors, metadata={"format": "pt"}) - - if self.config.library == "transformers": - self.logger.info("\t+ Saving no weights model pretrained config") - self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) + self.config.export = original_export + self.config.model = original_model - def load_ortmodel_with_no_weights(self) -> None: - self.logger.info("\t+ Creating no weights model") - self.create_no_weights_model() + self.logger.info("\t+ Validating requested Execution Provider") + self.validate_execution_provider() - with random_init_weights(): - original_model, self.config.model = self.config.model, self.no_weights_model - self.logger.info("\t+ Loading no weights ORTModel") - self.load_ortmodel_from_pretrained() - self.config.model = original_model + self.logger.info("\t+ Cleaning up backend temporary directory") + self.tmpdir.cleanup() def load_ortmodel_from_pretrained(self) -> None: - self.pretrained_model = self.ortmodel_class.from_pretrained( + self.pretrained_model = self.ort_model_loader.from_pretrained( self.config.model, export=self.config.export, session_options=self.session_options, @@ -129,6 +126,14 @@ def load_ortmodel_from_pretrained(self) -> None: **self.ortmodel_kwargs, ) + def load_ortmodel_with_no_weights(self) -> None: + original_model, self.config.model = self.config.model, self.no_weights_model + + with fast_weights_init(): + self.load_ortmodel_from_pretrained() + + self.config.model = original_model + @property def is_optimized(self) -> bool: return (self.config.auto_optimization is not None) or self.config.optimization @@ -167,15 +172,6 @@ def onnx_files_names(self): else: return [file for file in os.listdir(self.config.model) if file.endswith(".onnx")] - @property - def inputs_names(self) -> List[str]: - if hasattr(self.pretrained_model, "inputs_names"): - return self.pretrained_model.inputs_names - elif hasattr(self.pretrained_model, "input_names"): - return self.pretrained_model.input_names - else: - return [] - def optimize_onnx_files(self) -> None: self.logger.info("\t+ Attempting optimization") self.optimized_model = os.path.join(self.tmpdir.name, "optimized") @@ -231,7 +227,7 @@ def quantize_onnx_files(self) -> None: calibration_dataset = DatasetGenerator( task=self.config.task, dataset_shapes=dataset_shapes, model_shapes=self.model_shapes )() - columns_to_be_removed = list(set(calibration_dataset.column_names) - set(self.inputs_names)) + columns_to_be_removed = list(set(calibration_dataset.column_names) - set(self.pretrained_model.input_names)) calibration_dataset = calibration_dataset.remove_columns(columns_to_be_removed) self.logger.info("\t+ Processing calibration config") @@ -284,32 +280,34 @@ def quantize_onnx_files(self) -> None: if 
self.pretrained_config is not None: self.pretrained_config.save_pretrained(self.quantized_model) - def prepare_inputs( - self, inputs: Dict[str, Any], input_shapes: Dict[str, Any] - ) -> Tuple[Dict[str, Any], Dict[str, Any]]: - inputs, input_shapes = super().prepare_inputs(inputs, input_shapes) - + def prepare_input_shapes(self, input_shapes: Dict[str, Any]) -> Dict[str, Any]: if self.is_dp_distributed: if input_shapes["batch_size"] % torch.distributed.get_world_size() != 0: raise ValueError( - f"Batch size {input_shapes['batch_size']} must be divisible by data parallel " - f"world size {torch.distributed.get_world_size()}" + f"Batch size {input_shapes['batch_size']} must be divisible by " + f"data parallel world size {torch.distributed.get_world_size()}" ) - with Accelerator().split_between_processes(inputs=inputs, apply_padding=False) as split_inputs: - input_shapes["batch_size"] = input_shapes["batch_size"] // torch.distributed.get_world_size() - inputs = split_inputs + # distributing batch size across processes + input_shapes["batch_size"] //= torch.distributed.get_world_size() + + return input_shapes + + def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + if self.is_dp_distributed: + with Accelerator().split_between_processes(inputs=inputs, apply_padding=False) as process_inputs: + inputs = process_inputs if self.config.library == "transformers": for key, value in list(inputs.items()): if key in ["position_ids", "token_type_ids"]: - if key not in self.inputs_names: + if key not in self.pretrained_model.input_names: inputs.pop(key) for key, value in inputs.items(): if isinstance(value, torch.Tensor): inputs[key] = value.to(self.config.device) - return inputs, input_shapes + return inputs @torch.inference_mode() def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: diff --git a/optimum_benchmark/backends/onnxruntime/utils.py b/optimum_benchmark/backends/onnxruntime/utils.py index 86eeeed9..6177ae8e 100644 --- a/optimum_benchmark/backends/onnxruntime/utils.py +++ b/optimum_benchmark/backends/onnxruntime/utils.py @@ -3,15 +3,25 @@ from onnxruntime.quantization import CalibrationMethod, QuantFormat, QuantizationMode, QuantType from optimum.pipelines import ORT_SUPPORTED_TASKS -TASKS_TO_ORTSD = { - "stable-diffusion": "optimum.onnxruntime.ORTStableDiffusionPipeline", - "stable-diffusion-xl": "optimum.onnxruntime.ORTStableDiffusionXLPipeline", -} - TASKS_TO_ORTMODELS = { task: f"optimum.onnxruntime.{task_dict['class'][0].__name__}" for task, task_dict in ORT_SUPPORTED_TASKS.items() } +TASKS_TO_MODEL_TYPES_TO_ORTPIPELINES = { + "text-to-image": { + "stable-diffusion": "optimum.onnxruntime.ORTStableDiffusionPipeline", + "stable-diffusion-xl": "optimum.onnxruntime.ORTStableDiffusionXLPipeline", + "latent-consistency": "optimum.onnxruntime.ORTLatentConsistencyModelPipeline", + }, + "image-to-image": { + "stable-diffusion": "optimum.onnxruntime.ORTStableDiffusionImg2ImgPipeline", + "stable-diffusion-xl": "optimum.onnxruntime.ORTStableDiffusionImg2ImgXLPipeline", + }, + "inpainting": { + "stable-diffusion": "optimum.onnxruntime.ORTStableDiffusionInpaintingPipeline", + }, +} + def format_calibration_config(calibration_config: Dict[str, Any]) -> None: if calibration_config.get("method", None) is not None: diff --git a/optimum_benchmark/backends/openvino/backend.py b/optimum_benchmark/backends/openvino/backend.py index 91e8304f..cd2a57af 100644 --- a/optimum_benchmark/backends/openvino/backend.py +++ b/optimum_benchmark/backends/openvino/backend.py @@ 
-1,23 +1,21 @@ import inspect -import os from collections import OrderedDict from tempfile import TemporaryDirectory -from typing import Any, Dict, Tuple +from typing import Any, Dict import torch from hydra.utils import get_class from openvino.runtime import properties from optimum.intel.openvino import OVConfig as OVQuantizationConfig # naming conflict from optimum.intel.openvino import OVQuantizer -from safetensors.torch import save_file from ...generators.dataset_generator import DatasetGenerator from ...import_utils import is_accelerate_available, is_torch_distributed_available from ...task_utils import TEXT_GENERATION_TASKS from ..base import Backend -from ..transformers_utils import random_init_weights +from ..transformers_utils import fast_weights_init from .config import OVConfig -from .utils import TASKS_TO_OVMODEL +from .utils import TASKS_TO_MODEL_TYPES_TO_OVPIPELINE, TASKS_TO_OVMODEL if is_accelerate_available(): from accelerate import Accelerator @@ -31,91 +29,94 @@ class OVBackend(Backend[OVConfig]): def __init__(self, config: OVConfig) -> None: super().__init__(config) - self.validate_task() + + if self.config.task in TASKS_TO_OVMODEL: + self.ovmodel_class = get_class(TASKS_TO_OVMODEL[self.config.task]) + self.logger.info(f"\t+ Using OVModel class {self.ovmodel_class.__name__}") + elif self.config.task in TASKS_TO_MODEL_TYPES_TO_OVPIPELINE: + if self.config.model_type in TASKS_TO_MODEL_TYPES_TO_OVPIPELINE[self.config.task]: + self.ovmodel_class = get_class( + TASKS_TO_MODEL_TYPES_TO_OVPIPELINE[self.config.task][self.config.model_type] + ) + self.logger.info(f"\t+ Using OVPipeline class {self.ovmodel_class.__name__}") + else: + raise NotImplementedError( + f"OVBackend does not support model {self.config.model_type} for task {self.config.task}" + ) + else: + raise NotImplementedError(f"OVBackend does not support task {self.config.task}") if self.config.inter_op_num_threads is not None: self.logger.info(f"\t+ Setting inter_op_num_threads to {self.config.inter_op_num_threads}") self.config.openvino_config[properties.inference_num_threads()] = self.config.inter_op_num_threads + def load(self) -> None: self.logger.info("\t+ Creating backend temporary directory") self.tmpdir = TemporaryDirectory() if self.config.quantization: if self.config.no_weights: + self.logger.info("\t+ Creating no weights AutoModel") + self.create_no_weights_model() self.logger.info("\t+ Loading no weights AutoModel") - self.load_automodel_with_no_weights() + self._load_automodel_with_no_weights() else: self.logger.info("\t+ Loading pretrained AutoModel") - self.load_automodel_from_pretrained() - + self._load_automodel_from_pretrained() self.logger.info("\t+ Applying post-training quantization") self.quantize_automodel() - original_model, self.config.model = self.config.model, self.quantized_model original_export, self.config.export = self.config.export, False self.logger.info("\t+ Loading quantized OVModel") - self.load_ovmodel_from_pretrained() + self._load_ovmodel_from_pretrained() self.config.model, self.config.export = original_model, original_export - elif self.config.no_weights: + self.logger.info("\t+ Creating no weights OVModel") + self.create_no_weights_model() self.logger.info("\t+ Loading no weights OVModel") - self.load_ovmodel_with_no_weights() + self._load_ovmodel_with_no_weights() else: self.logger.info("\t+ Loading pretrained OVModel") - self.load_ovmodel_from_pretrained() + self._load_ovmodel_from_pretrained() - self.tmpdir.cleanup() + if self.config.reshape: + static_shapes = { + key: 
value + for key, value in {**self.input_shapes, **self.model_shapes}.items() + if key in inspect.getfullargspec(self.pretrained_model.reshape).args + } + if ("sequence_length" in static_shapes) and ("height" in static_shapes) and ("width" in static_shapes): + # for vision models, sequence_length is the number of channels + static_shapes["sequence_length"] = self.model_shapes.get("num_channels") - def validate_task(self) -> None: - if self.config.task not in TASKS_TO_OVMODEL: - raise NotImplementedError(f"OVBackend does not support task {self.config.task}") + self.logger.info(f"\t+ Reshaping model with static shapes: {static_shapes}") + self.pretrained_model.reshape(**static_shapes) + + if self.config.half: + self.logger.info("\t+ Converting model to half precision") + self.pretrained_model.half() - self.ovmodel_class = get_class(TASKS_TO_OVMODEL[self.config.task]) - self.logger.info(f"\t+ Using OVModel class {self.ovmodel_class.__name__}") + if self.config.reshape or self.config.half: + self.logger.info("\t+ Compiling model") + self.pretrained_model.compile() - def create_no_weights_model(self) -> None: - self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") - self.logger.info("\t+ Creating no weights model directory") - os.makedirs(self.no_weights_model, exist_ok=True) - self.logger.info("\t+ Creating no weights model state dict") - state_dict = torch.nn.Linear(1, 1).state_dict() - self.logger.info("\t+ Saving no weights model safetensors") - safetensors = os.path.join(self.no_weights_model, "model.safetensors") - save_file(tensors=state_dict, filename=safetensors, metadata={"format": "pt"}) + self.tmpdir.cleanup() - if self.config.library == "transformers": - self.logger.info("\t+ Saving no weights model pretrained config") - self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) + def _load_automodel_from_pretrained(self) -> None: + self.pretrained_model = self.automodel_loader.from_pretrained(self.config.model, **self.config.model_kwargs) - def load_automodel_with_no_weights(self) -> None: - self.logger.info("\t+ Creating no weights model") - self.create_no_weights_model() + def _load_automodel_with_no_weights(self) -> None: + original_model, self.config.model = self.config.model, self.no_weights_model - with random_init_weights(): - original_model, self.config.model = self.config.model, self.no_weights_model - self.logger.info("\t+ Loading no weights AutoModel") - self.load_automodel_from_pretrained() - self.config.model = original_model + with fast_weights_init(): + self._load_automodel_from_pretrained() self.logger.info("\t+ Tying model weights") self.pretrained_model.tie_weights() - def load_automodel_from_pretrained(self) -> None: - self.pretrained_model = self.automodel_class.from_pretrained(self.config.model, **self.config.model_kwargs) - - def load_ovmodel_with_no_weights(self) -> None: - self.logger.info("\t+ Creating no weights model") - self.create_no_weights_model() + self.config.model = original_model - with random_init_weights(): - original_model, self.config.model = self.config.model, self.no_weights_model - original_export, self.config.export = self.config.export, True - self.logger.info("\t+ Loading no weights OVModel") - self.load_ovmodel_from_pretrained() - self.config.model = original_model - self.config.export = original_export - - def load_ovmodel_from_pretrained(self) -> None: + def _load_ovmodel_from_pretrained(self) -> None: self.pretrained_model = self.ovmodel_class.from_pretrained( self.config.model, 
export=self.config.export, @@ -125,6 +126,15 @@ def load_ovmodel_from_pretrained(self) -> None: **self.ovmodel_kwargs, ) + def _load_ovmodel_with_no_weights(self) -> None: + with fast_weights_init(): + original_model, self.config.model = self.config.model, self.no_weights_model + original_export, self.config.export = self.config.export, True + self.logger.info("\t+ Loading no weights OVModel") + self._load_ovmodel_from_pretrained() + self.config.export = original_export + self.config.model = original_model + @property def is_dp_distributed(self) -> bool: return is_torch_distributed_available() and torch.distributed.is_initialized() @@ -171,43 +181,27 @@ def quantize_automodel(self) -> None: batch_size=1, ) - def prepare_inputs( - self, inputs: Dict[str, Any], input_shapes: Dict[str, Any] - ) -> Tuple[Dict[str, Any], Dict[str, Any]]: - inputs, input_shapes = super().prepare_inputs(inputs, input_shapes) - + def prepare_input_shapes(self, input_shapes: Dict[str, Any]) -> Dict[str, Any]: if self.is_dp_distributed: if input_shapes["batch_size"] % torch.distributed.get_world_size() != 0: raise ValueError( - f"Batch size {input_shapes['batch_size']} must be divisible by data parallel " - f"world size {torch.distributed.get_world_size()}" + f"Batch size {input_shapes['batch_size']} must be divisible by " + f"data parallel world size {torch.distributed.get_world_size()}" ) - with Accelerator().split_between_processes(inputs=inputs, apply_padding=False) as split_inputs: - input_shapes["batch_size"] = input_shapes["batch_size"] // torch.distributed.get_world_size() - inputs = split_inputs - - return inputs, input_shapes + # distributing batch size across processes + input_shapes["batch_size"] //= torch.distributed.get_world_size() - def prepare_for_inference(self, **kwargs) -> None: - if self.config.reshape: - static_shapes = { - key: value - for key, value in kwargs.items() - if key in inspect.getfullargspec(self.pretrained_model.reshape).args - } - if (static_shapes.get("height", None) is not None) and ("sequence_length" in static_shapes): - static_shapes["sequence_length"] = kwargs.get("num_channels", 3) + # registering input shapes for usage during model reshaping + self.input_shapes = input_shapes - self.logger.info(f"\t+ Reshaping model with static shapes: {static_shapes}") - self.pretrained_model.reshape(**static_shapes) + return input_shapes - if self.config.half: - self.logger.info("\t+ Converting model to half precision") - self.pretrained_model.half() + def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + if self.is_dp_distributed: + with Accelerator().split_between_processes(inputs=inputs, apply_padding=False) as process_inputs: + inputs = process_inputs - if self.config.reshape or self.config.half: - self.logger.info("\t+ Compiling model") - self.pretrained_model.compile() + return inputs def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: return self.pretrained_model.forward(**inputs, **kwargs) diff --git a/optimum_benchmark/backends/openvino/utils.py b/optimum_benchmark/backends/openvino/utils.py index b1005f38..35518346 100644 --- a/optimum_benchmark/backends/openvino/utils.py +++ b/optimum_benchmark/backends/openvino/utils.py @@ -1,4 +1,19 @@ -from optimum.intel.openvino.utils import _HEAD_TO_AUTOMODELS - -TASKS_TO_OVMODEL = {task: f"optimum.intel.openvino.{ovmodel}" for task, ovmodel in _HEAD_TO_AUTOMODELS.items()} -TASKS_TO_OVMODEL.update({"feature-extraction": "optimum.intel.openvino.OVModelForFeatureExtraction"}) +TASKS_TO_OVMODEL = 
{ + "fill-mask": "optimum.intel.openvino.OVModelForMaskedLM", + "text-generation": "optimum.intel.openvino.OVModelForCausalLM", + "text2text-generation": "optimum.intel.openvino.OVModelForSeq2SeqLM", + "feature-extraction": "optimum.intel.openvino.OVModelForFeatureExtraction", + "text-classification": "optimum.intel.openvino.OVModelForSequenceClassification", + "token-classification": "optimum.intel.openvino.OVModelForTokenClassification", + "question-answering": "optimum.intel.openvino.OVModelForQuestionAnswering", + "image-classification": "optimum.intel.openvino.OVModelForImageClassification", + "audio-classification": "optimum.intel.openvino.OVModelForAudioClassification", + "pix2struct": "optimum.intel.openvino.OVModelForPix2Struct", +} +TASKS_TO_MODEL_TYPES_TO_OVPIPELINE = { + "text-to-image": { + "lcm": "optimum.intel.openvino.OVLatentConsistencyModelPipeline", + "stable-diffusion": "optimum.intel.openvino.OVStableDiffusionPipeline", + "stable-diffusion-xl": "optimum.intel.openvino.OVStableDiffusionXLPipeline", + }, +} diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py index a93d7b5d..6e637a31 100644 --- a/optimum_benchmark/backends/py_txi/backend.py +++ b/optimum_benchmark/backends/py_txi/backend.py @@ -1,14 +1,15 @@ import os from tempfile import TemporaryDirectory -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, List import torch +from accelerate import init_empty_weights from py_txi import TEI, TGI, TEIConfig, TGIConfig from safetensors.torch import save_file from ...task_utils import TEXT_EMBEDDING_TASKS, TEXT_GENERATION_TASKS from ..base import Backend -from ..transformers_utils import random_init_weights +from ..transformers_utils import fast_weights_init from .config import PyTXIConfig @@ -18,10 +19,13 @@ class PyTXIBackend(Backend[PyTXIConfig]): def __init__(self, config: PyTXIConfig) -> None: super().__init__(config) + def load(self) -> None: self.logger.info("\t+ Creating backend temporary directory") self.tmpdir = TemporaryDirectory() if self.config.no_weights: + self.logger.info("\t+ Creating no weights model") + self.create_no_weights_model() self.logger.info("\t+ Loading no weights model") self.load_model_with_no_weights() else: @@ -43,8 +47,8 @@ def volume(self) -> str: def download_pretrained_model(self) -> None: # directly downloads pretrained model in volume (/data) to change generation config before loading model - with torch.device("meta"): - self.automodel_class.from_pretrained(self.config.model, **self.config.model_kwargs, cache_dir=self.volume) + with init_empty_weights(include_buffers=True): + self.automodel_loader.from_pretrained(self.config.model, **self.config.model_kwargs, cache_dir=self.volume) def prepare_generation_config(self) -> None: self.generation_config.eos_token_id = None @@ -73,8 +77,8 @@ def create_no_weights_model(self) -> None: self.pretrained_processor.save_pretrained(save_directory=self.no_weights_model) # unlike Transformers, TXI won't accept any missing tensors so we need to materialize the model self.logger.info(f"\t+ Loading no weights model from {self.no_weights_model}") - with random_init_weights(): - self.pretrained_model = self.automodel_class.from_pretrained( + with fast_weights_init(): + self.pretrained_model = self.automodel_loader.from_pretrained( self.no_weights_model, **self.config.model_kwargs, device_map="auto", _fast_init=False ) self.logger.info("\t+ Saving no weights model") @@ -86,14 +90,10 @@ def create_no_weights_model(self) -> None: 
self.logger.info("\t+ Modifying generation config for fixed length generation") self.generation_config.eos_token_id = None self.generation_config.pad_token_id = None - self.logger.info("\t+ Saving new pretrained generation config") self.generation_config.save_pretrained(save_directory=self.no_weights_model) def load_model_with_no_weights(self) -> None: - self.logger.info("\t+ Creating no weights model") - self.create_no_weights_model() - original_volumes, self.config.volumes = self.config.volumes, {self.tmpdir.name: {"bind": "/data", "mode": "rw"}} original_model, self.config.model = self.config.model, "/data/no_weights_model" self.logger.info("\t+ Loading no weights model") @@ -139,11 +139,7 @@ def load_model_from_pretrained(self) -> None: else: raise NotImplementedError(f"TXI does not support task {self.config.task}") - def prepare_inputs( - self, inputs: Dict[str, Any], input_shapes: Dict[str, Any] - ) -> Tuple[Dict[str, Any], Dict[str, Any]]: - inputs, input_shapes = super().prepare_inputs(inputs, input_shapes) - + def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: if self.config.task in TEXT_GENERATION_TASKS: inputs = {"prompt": self.pretrained_processor.batch_decode(inputs["input_ids"].tolist())} elif self.config.task in TEXT_EMBEDDING_TASKS: @@ -151,7 +147,7 @@ def prepare_inputs( else: raise NotImplementedError(f"TXI does not support task {self.config.task}") - return inputs, input_shapes + return inputs def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> List[str]: return self.pretrained_model.encode(**inputs, **kwargs) diff --git a/optimum_benchmark/backends/pytorch/backend.py b/optimum_benchmark/backends/pytorch/backend.py index efcbaff4..33914164 100644 --- a/optimum_benchmark/backends/pytorch/backend.py +++ b/optimum_benchmark/backends/pytorch/backend.py @@ -1,10 +1,10 @@ import os from collections import OrderedDict from tempfile import TemporaryDirectory -from typing import Any, Callable, Dict, List, Tuple +from typing import Any, Callable, Dict, List import torch -from accelerate import Accelerator +from accelerate import Accelerator, init_empty_weights, init_on_device from datasets import Dataset from safetensors.torch import save_file from transformers import ( @@ -20,7 +20,7 @@ from ...import_utils import is_deepspeed_available, is_torch_distributed_available, is_zentorch_available from ..base import Backend from ..peft_utils import apply_peft -from ..transformers_utils import random_init_weights +from ..transformers_utils import fast_weights_init from .config import PyTorchConfig if is_deepspeed_available(): @@ -38,20 +38,12 @@ class PyTorchBackend(Backend[PyTorchConfig]): def __init__(self, config: PyTorchConfig): super().__init__(config) - self.validate_library() - - if self.config.deepspeed_inference and self.is_quantized: - raise ValueError("Deepspeed-Inference is not compatible with Transformers quantization") - - # Quantization - if self.is_quantized: - self.logger.info("\t+ Processing quantization config") - self.process_quantization_config() # Threads if self.config.inter_op_num_threads is not None: self.logger.info(f"\t+ Setting pytorch inter_op_num_threads({self.config.inter_op_num_threads}))") torch.set_num_threads(self.config.inter_op_num_threads) + if self.config.intra_op_num_threads is not None: self.logger.info(f"\t+ Setting pytorch intra_op_num_threads({self.config.intra_op_num_threads}))") torch.set_num_interop_threads(self.config.intra_op_num_threads) @@ -71,36 +63,111 @@ def __init__(self, config: PyTorchConfig): 
else: raise ValueError(f"Device {self.config.device} not supported for autocast") + def load(self) -> None: self.logger.info("\t+ Creating backend temporary directory") self.tmpdir = TemporaryDirectory() - # Model - if self.config.no_weights and (self.config.library == "diffusers" or self.config.library == "timm"): - raise ValueError("Diffusion pipelines and Timm models don't support no weights") - elif self.config.no_weights: - self.logger.info("\t+ Loading model with random weights") - self.load_model_with_no_weights() + if self.config.library == "transformers": + self.load_transformers_model() + elif self.config.library == "diffusers": + self.load_diffusers_model() + elif self.config.library == "timm": + self.load_timm_model() else: - self.logger.info("\t+ Loading model with pretrained weights") - self.load_model_from_pretrained() + raise ValueError(f"Library {self.config.library} not supported for PyTorch backend") + self.logger.info("\t+ Cleaning up backend temporary directory") self.tmpdir.cleanup() + def load_transformers_model_from_pretrained(self) -> None: + if self.is_quantized: + self.logger.info(f"\t+ Loading {self.quantization_config.quant_method}-quantized model") + self.pretrained_model = self.automodel_loader.from_pretrained( + pretrained_model_name_or_path=self.config.model, + device_map=self.config.device_map or torch.device(self.config.device), + # quantized models are more compatible with device_map dispatcher than (to(device)) + # using to(device) on quantized models sometimes leaves some layers on cpu or raises + # an error because the layers are already on the device + **self.config.model_kwargs, + **self.automodel_kwargs, + ) + elif self.config.device_map is not None: + self.logger.info(f"\t+ Loading Transformers model with device map: {self.config.device_map}") + self.pretrained_model = self.automodel_loader.from_pretrained( + pretrained_model_name_or_path=self.config.model, + device_map=self.config.device_map, + **self.config.model_kwargs, + **self.automodel_kwargs, + ) + else: + self.logger.info("\t+ Loading Transformers model") + self.pretrained_model = self.automodel_loader.from_pretrained( + pretrained_model_name_or_path=self.config.model, **self.config.model_kwargs, **self.automodel_kwargs + ) + if self.config.device != "cpu": + self.logger.info(f"\t+ Moving Transformers model to device: {self.config.device}") + self.pretrained_model = self.pretrained_model.to(self.config.device) + + def load_transformers_model_with_no_weights(self) -> None: + original_model, self.config.model = self.config.model, self.no_weights_model + + if self.config.deepspeed_inference: + with init_empty_weights(include_buffers=False): + self.logger.info("\t+ Loading Transformers model on meta device for fast initialization") + self.pretrained_model = self.automodel_loader.from_pretrained( + pretrained_model_name_or_path=self.config.model, + **self.config.model_kwargs, + **self.automodel_kwargs, + ) + self.pretrained_model.to_empty(device="cpu") + elif self.config.device_map is None and not self.is_quantized: + with init_on_device(device=torch.device(self.config.device), include_buffers=True): + self.logger.info("\t+ Loading Transformers model using device context manager for fast initialization") + self.pretrained_model = self.automodel_loader.from_pretrained( + pretrained_model_name_or_path=self.no_weights_model, + **self.config.model_kwargs, + **self.automodel_kwargs, + ) + else: + with fast_weights_init(): + self.load_transformers_model_from_pretrained() + + self.config.model = 
original_model + + def load_transformers_model(self): + if self.config.deepspeed_inference and self.is_quantized: + raise ValueError("Deepspeed-Inference is not compatible with Transformers quantization") + + # Quantization + if self.is_quantized: + self.logger.info("\t+ Processing quantization config") + self.process_quantization_config() + + # Model loading + if self.config.no_weights: + self.logger.info("\t+ Creating no weights model") + self.create_no_weights_model() + self.logger.info("\t+ Loading model with random weights") + self.load_transformers_model_with_no_weights() + else: + self.logger.info("\t+ Loading model with pretrained weights") + self.load_transformers_model_from_pretrained() + # KV-Cache if self.config.cache_implementation is not None: self.logger.info(f"\t+ Setting cache implementation to {self.config.cache_implementation}") self.pretrained_model.generation_config.cache_implementation = self.config.cache_implementation - # Eval mode - if self.config.eval_mode and self.config.library != "diffusers": - self.logger.info("\t+ Turning on model's eval mode") - self.pretrained_model.eval() - # BetterTransformer if self.config.to_bettertransformer: - self.logger.info("\t+ Enabling BetterTransformer") + self.logger.info("\t+ To BetterTransformer") self.pretrained_model.to_bettertransformer() + # Eval mode + if self.config.eval_mode: + self.logger.info("\t+ Enabling eval mode") + self.pretrained_model.eval() + # PEFT if self.config.peft_type is not None: self.logger.info("\t+ Applying PEFT") @@ -115,87 +182,76 @@ def __init__(self, config: PyTorchConfig): # Torch compile if self.config.torch_compile: - if self.config.library == "diffusers": - self.logger.info("\t+ Using torch.compile on unet and vae") - self.pretrained_model.unet = torch.compile( - self.pretrained_model.unet, **self.config.torch_compile_config - ) - self.pretrained_model.vae.decode = torch.compile( - self.pretrained_model.vae.decode, **self.config.torch_compile_config + if self.config.torch_compile_target == "forward": + self.logger.info("\t+ Using torch.compile on forward") + self.pretrained_model.forward = torch.compile( + self.pretrained_model.forward, **self.config.torch_compile_config ) + elif self.config.torch_compile_target == "model": + self.logger.info("\t+ Using torch.compile on model") + self.pretrained_model = torch.compile(self.pretrained_model, **self.config.torch_compile_config) else: - if self.config.torch_compile_target == "forward": - self.logger.info("\t+ Using torch.compile on forward") - self.pretrained_model.forward = torch.compile( - self.pretrained_model.forward, **self.config.torch_compile_config - ) - elif self.config.torch_compile_target == "model": - self.logger.info("\t+ Using torch.compile on model") - self.pretrained_model = torch.compile(self.pretrained_model, **self.config.torch_compile_config) - else: - raise ValueError(f"Target {self.config.torch_compile_target} not supported") + raise ValueError(f"Target {self.config.torch_compile_target} not supported") + + def load_diffusers_pipeline_from_pretrained(self) -> None: + self.pretrained_model = self.automodel_loader.from_pretrained( + self.config.model, + # pretrained_model_name_or_path=self.config.model, + # pretrained_model_or_path=self.config.model, + device_map=self.config.device_map, + **self.config.model_kwargs, + **self.automodel_kwargs, + ) + if self.config.device_map is None and self.config.device != "cpu": + self.logger.info(f"\t+ Moving Diffusion Pipeline to device: {self.config.device}") + self.pretrained_model = 
self.pretrained_model.to(self.config.device) - def validate_library(self) -> None: - if self.config.library == "timm": - self.logger.info(f"\t+ Using Timm's {self.automodel_class.__name__}") - elif self.config.library == "diffusers": - self.logger.info(f"\t+ Using Diffusers Pipeline {self.automodel_class.__name__}") - elif self.config.library == "transformers": - self.logger.info(f"\t+ Using AutoModel {self.automodel_class.__name__}") - else: - raise ValueError(f"Library {self.config.library} not supported") + def load_diffusers_model(self): + self.logger.info("\t+ Loading Diffusion Pipeline") + self.logger.info(f"\t+ Using Diffusers Pipeline {self.automodel_loader.__name__}") - def load_model_from_pretrained(self) -> None: - if self.config.library == "timm": - self.logger.info("\t+ Loading Timm model") - self.pretrained_model = self.automodel_class(model_name=self.config.model) - if self.config.device != "cpu": - self.logger.info(f"\t+ Moving Timm model to device: {self.config.device}") - self.pretrained_model = self.pretrained_model.to(self.config.device) + # Model loading + if self.config.no_weights: + raise ValueError("No weights model not supported for Diffusers") + else: + self.load_diffusers_pipeline_from_pretrained() - elif self.config.library == "diffusers": - self.logger.info("\t+ Loading Diffusion Pipeline") - self.pretrained_model = self.automodel_class.from_pretrained( - # pretrained_model_name_or_path=self.config.model, - # pretrained_model_or_path=self.config.model, - self.config.model, - device_map=self.config.device_map, - **self.config.model_kwargs, - **self.automodel_kwargs, + # Torch compile + if self.config.torch_compile: + self.logger.info("\t+ Using torch.compile on unet and vae") + self.pretrained_model.unet = torch.compile(self.pretrained_model.unet, **self.config.torch_compile_config) + self.pretrained_model.vae.decode = torch.compile( + self.pretrained_model.vae.decode, **self.config.torch_compile_config ) - if self.config.device_map is None and self.config.device != "cpu": - self.logger.info(f"\t+ Moving Diffusion Pipeline to device: {self.config.device}") - self.pretrained_model = self.pretrained_model.to(self.config.device) - elif self.is_quantized: - self.logger.info(f"\t+ Loading {self.quantization_config.quant_method}-quantized model") - self.pretrained_model = self.automodel_class.from_pretrained( - pretrained_model_name_or_path=self.config.model, - device_map=self.config.device_map or torch.device(self.config.device), - # quantized models are more compatible with device_map dispatcher than (to(device)) - # using to(device) on quantized models sometimes leaves some layers on cpu or raises - # an error because the layers are already on the device - **self.config.model_kwargs, - **self.automodel_kwargs, - ) + def load_timm_model_form_pretrained(self) -> None: + self.pretrained_model = self.automodel_loader(model_name=self.config.model) + if self.config.device != "cpu": + self.logger.info(f"\t+ Moving Timm model to device: {self.config.device}") + self.pretrained_model = self.pretrained_model.to(self.config.device) - elif self.config.device_map is not None: - self.logger.info(f"\t+ Loading Transformers model with device map: {self.config.device_map}") - self.pretrained_model = self.automodel_class.from_pretrained( - pretrained_model_name_or_path=self.config.model, - device_map=self.config.device_map, - **self.config.model_kwargs, - **self.automodel_kwargs, - ) + def load_timm_model(self): + self.logger.info("\t+ Loading Timm model") + self.logger.info(f"\t+ 
Using Timm's {self.automodel_loader.__name__}") + # Model loading + if self.config.no_weights: + raise ValueError("No weights model not supported for Timm") else: - self.logger.info("\t+ Loading Transformers model") - self.pretrained_model = self.automodel_class.from_pretrained( - pretrained_model_name_or_path=self.config.model, **self.config.model_kwargs, **self.automodel_kwargs - ) - if self.config.device != "cpu": - self.logger.info(f"\t+ Moving Transformers model to device: {self.config.device}") - self.pretrained_model = self.pretrained_model.to(self.config.device) + self.load_timm_model_form_pretrained() + + # Torch compile + if self.config.torch_compile: + if self.config.torch_compile_target == "forward": + self.logger.info("\t+ Using torch.compile on forward") + self.pretrained_model.forward = torch.compile( + self.pretrained_model.forward, **self.config.torch_compile_config + ) + elif self.config.torch_compile_target == "model": + self.logger.info("\t+ Using torch.compile on model") + self.pretrained_model = torch.compile(self.pretrained_model, **self.config.torch_compile_config) + else: + raise ValueError(f"Target {self.config.torch_compile_target} not supported") def create_no_weights_model(self) -> None: if self.pretrained_config is None: @@ -209,8 +265,8 @@ def create_no_weights_model(self) -> None: if self.is_exllamav2: self.logger.info("\t+ Adding g_idx to no weights model state dict") - with torch.device("meta"): - meta_model = self.automodel_class.from_config(self.pretrained_config) + with init_empty_weights(include_buffers=False): + meta_model = self.automodel_loader.from_config(self.pretrained_config) for name, module in meta_model.named_modules(): if hasattr(module, "in_features"): state_dict[name + ".g_idx"] = torch.ones((module.in_features,), dtype=torch.int32) @@ -227,38 +283,6 @@ def create_no_weights_model(self) -> None: self.logger.info("\t+ Saving no weights model pretrained config") self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) - def load_model_with_no_weights(self) -> None: - self.create_no_weights_model() - original_model, self.config.model = self.config.model, self.no_weights_model - - if self.config.deepspeed_inference: - with torch.device("meta"): - # with big models, loading no_weights_model is very slow (randomizing every weight) - # so we load the model on meta device to speed up the process and then move it to cpu - self.logger.info("\t+ Loading Transformers model on meta device for fast initialization") - self.pretrained_model = self.automodel_class.from_pretrained( - pretrained_model_name_or_path=self.config.model, - **self.config.model_kwargs, - **self.automodel_kwargs, - ) - self.logger.info("\t+ Materializing meta model on CPU to avoid OOM") - self.pretrained_model.to_empty(device="cpu") - - elif not self.is_quantized and self.config.device_map is None: - with torch.device(self.config.device): - self.logger.info("\t+ Loading Transformers model using device context manager for fast initialization") - self.pretrained_model = self.automodel_class.from_pretrained( - pretrained_model_name_or_path=self.no_weights_model, - **self.config.model_kwargs, - **self.automodel_kwargs, - ) - - else: - with random_init_weights(): - self.load_model_from_pretrained() - - self.config.model = original_model - def process_quantization_config(self) -> None: if self.is_gptq_quantized: self.logger.info("\t+ Processing GPTQ config") @@ -353,31 +377,34 @@ def automodel_kwargs(self) -> Dict[str, Any]: kwargs["low_cpu_mem_usage"] = 
self.config.low_cpu_mem_usage if self.config.no_weights: - # we use our own context manager to load the model with random weights + # we use our own context manager to load the + # model with faster random weights generators kwargs["_fast_init"] = False return kwargs - def prepare_inputs( - self, inputs: Dict[str, Any], input_shapes: Dict[str, Any] - ) -> Tuple[Dict[str, Any], Dict[str, Any]]: - inputs, input_shapes = super().prepare_inputs(inputs, input_shapes) - + def prepare_input_shapes(self, input_shapes: Dict[str, Any]) -> Dict[str, Any]: if self.is_dp_distributed: if input_shapes["batch_size"] % torch.distributed.get_world_size() != 0: raise ValueError( - f"Batch size {input_shapes['batch_size']} must be divisible by data parallel " - f"world size {torch.distributed.get_world_size()}" + f"Batch size {input_shapes['batch_size']} must be divisible by " + f"data parallel world size {torch.distributed.get_world_size()}" ) - with Accelerator().split_between_processes(inputs=inputs, apply_padding=False) as split_inputs: - input_shapes["batch_size"] = input_shapes["batch_size"] // torch.distributed.get_world_size() - inputs = split_inputs + # distributing batch size across processes + input_shapes["batch_size"] //= torch.distributed.get_world_size() if self.is_tp_distributed: if torch.distributed.get_rank() != 0: - # this is to force throughput of non main shards to 0 + # zeroing throughput on other ranks input_shapes["batch_size"] = 0 + return input_shapes + + def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + if self.is_dp_distributed: + with Accelerator().split_between_processes(inputs=inputs, apply_padding=False) as process_inputs: + inputs = process_inputs + if self.config.library == "timm": inputs = {"x": inputs["pixel_values"]} @@ -385,7 +412,7 @@ def prepare_inputs( if isinstance(value, torch.Tensor): inputs[key] = value.to(self.config.device) - return inputs, input_shapes + return inputs @torch.inference_mode() def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: @@ -393,6 +420,9 @@ def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict @torch.inference_mode() def prefill(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: + assert ( + kwargs.get("max_new_tokens") == kwargs.get("min_new_tokens") == 1 + ), "For prefilling, max_new_tokens and min_new_tokens must be equal to 1" return self.pretrained_model.generate(**inputs, **kwargs) @torch.inference_mode() diff --git a/optimum_benchmark/backends/tensorrt_llm/backend.py b/optimum_benchmark/backends/tensorrt_llm/backend.py index a4bc1c39..a05187c3 100644 --- a/optimum_benchmark/backends/tensorrt_llm/backend.py +++ b/optimum_benchmark/backends/tensorrt_llm/backend.py @@ -1,11 +1,8 @@ -import os from collections import OrderedDict from tempfile import TemporaryDirectory from typing import Any, Dict -import torch from hydra.utils import get_class -from safetensors.torch import save_file from ..base import Backend from .config import TRTLLMConfig @@ -17,37 +14,25 @@ class TRTLLMBackend(Backend[TRTLLMConfig]): def __init__(self, config: TRTLLMConfig): super().__init__(config) - self.validate_model_type() + if self.config.model_type in MODEL_TYPE_TO_TRTLLMMODEL: + self.trtllm_loader = get_class(MODEL_TYPE_TO_TRTLLMMODEL[self.config.model_type]) + self.logger.info(f"\t+ Using TRTLLMModel class {self.trtllm_loader.__name__}") + else: + raise NotImplementedError(f"TRTLLMBackend does not support model_type {self.config.model_type}") + + def load(self) 
-> None: self.logger.info("\t+ Creating backend temporary directory") self.tmpdir = TemporaryDirectory() self.logger.info("\t+ Loading pretrained TRTLLMModel") self.load_trtmodel_from_pretrained() + self.logger.info("\t+ Cleaning up backend temporary directory") self.tmpdir.cleanup() - def validate_model_type(self) -> None: - if self.model_type not in MODEL_TYPE_TO_TRTLLMMODEL: - raise NotImplementedError(f"TRTLLMBackend does not support model_type {self.model_type}") - - self.trtmodel_class = get_class(MODEL_TYPE_TO_TRTLLMMODEL[self.model_type]) - self.logger.info(f"\t+ Using TRTLLMModel class {self.trtmodel_class.__name__}") - - def create_no_weights_model(self) -> None: - self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") - self.logger.info("\t+ Creating no weights model state dict") - state_dict = torch.nn.Linear(1, 1).state_dict() - self.logger.info("\t+ Saving no weights model safetensors") - safetensors = os.path.join(self.no_weights_model, "model.safetensors") - save_file(tensors=state_dict, filename=safetensors, metadata={"format": "pt"}) - - if self.config.library == "transformers": - self.logger.info("\t+ Saving no weights model pretrained config") - self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) - def load_trtmodel_from_pretrained(self) -> None: - self.pretrained_model = self.trtmodel_class.from_pretrained( + self.pretrained_model = self.trtllm_loader.from_pretrained( self.config.model, tp=self.config.tp, pp=self.config.pp, diff --git a/optimum_benchmark/backends/timm_utils.py b/optimum_benchmark/backends/timm_utils.py index 22a017f9..77ed3000 100644 --- a/optimum_benchmark/backends/timm_utils.py +++ b/optimum_benchmark/backends/timm_utils.py @@ -5,18 +5,19 @@ from ..import_utils import is_timm_available if is_timm_available(): - import timm # type: ignore + from timm import create_model + from timm.models import get_pretrained_cfg, load_model_config_from_hf, parse_model_name def get_timm_pretrained_config(model_name: str) -> PretrainedConfig: - model_source, model_name = timm.models.parse_model_name(model_name) + model_source, model_name = parse_model_name(model_name) if model_source == "hf-hub": # For model names specified in the form `hf-hub:path/architecture_name@revision`, # load model weights + pretrained_cfg from Hugging Face hub. 
- pretrained_cfg, model_name = timm.models.load_model_config_from_hf(model_name) + pretrained_cfg, model_name = load_model_config_from_hf(model_name) return pretrained_cfg - return timm.get_pretrained_cfg(model_name) + return get_pretrained_cfg(model_name) def extract_timm_shapes_from_config(config: PretrainedConfig) -> Dict[str, Any]: @@ -70,3 +71,7 @@ def extract_timm_shapes_from_config(config: PretrainedConfig) -> Dict[str, Any]: shapes["num_labels"] = num_classes return shapes + + +def get_timm_automodel_loader(): + return create_model diff --git a/optimum_benchmark/backends/torch_ort/backend.py b/optimum_benchmark/backends/torch_ort/backend.py index 5f915001..61401a75 100644 --- a/optimum_benchmark/backends/torch_ort/backend.py +++ b/optimum_benchmark/backends/torch_ort/backend.py @@ -1,16 +1,14 @@ -import os from tempfile import TemporaryDirectory from typing import Any, Callable, Dict, List import torch from datasets import Dataset from optimum.onnxruntime import ORTTrainer, ORTTrainingArguments -from safetensors.torch import save_file from transformers import TrainerCallback from ..base import Backend from ..peft_utils import apply_peft -from ..transformers_utils import random_init_weights +from ..transformers_utils import fast_weights_init from .config import TorchORTConfig @@ -19,12 +17,14 @@ class TorchORTBackend(Backend[TorchORTConfig]): def __init__(self, config: TorchORTConfig): super().__init__(config) - self.validate_library() + def load(self) -> None: self.logger.info("\t+ Creating backend temporary directory") self.tmpdir = TemporaryDirectory() if self.config.no_weights: + self.logger.info("\t+ Creating no weights AutoModel") + self.create_no_weights_model() self.logger.info("\t+ Loading no weights AutoModel") self.load_automodel_with_no_weights() else: @@ -35,43 +35,22 @@ def __init__(self, config: TorchORTConfig): self.logger.info("\t+ Applying PEFT") self.pretrained_model = apply_peft(self.pretrained_model, self.config.peft_type, self.config.peft_config) + self.logger.info("\t+ Cleaning up backend temporary directory") self.tmpdir.cleanup() - def validate_library(self) -> None: - if self.config.library == "transformers": - self.logger.info(f"Using AutoModel class {self.automodel_class.__name__}") - else: - raise NotImplementedError(f"TorchORTBackend does not support {self.config.library} library") - - def create_no_weights_model(self) -> None: - self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") - self.logger.info("\t+ Creating no weights model directory") - os.makedirs(self.no_weights_model, exist_ok=True) - self.logger.info("\t+ Creating no weights model state dict") - state_dict = torch.nn.Linear(1, 1).state_dict() - self.logger.info("\t+ Saving no weights model safetensors") - safetensors = os.path.join(self.no_weights_model, "model.safetensors") - save_file(tensors=state_dict, filename=safetensors, metadata={"format": "pt"}) - - if self.config.library == "transformers": - self.logger.info("\t+ Saving no weights model pretrained config") - self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) - def load_automodel_with_no_weights(self) -> None: - self.logger.info("\t+ Creating no weights model") - self.create_no_weights_model() + original_model, self.config.model = self.config.model, self.no_weights_model - with random_init_weights(): - original_model, self.config.model = self.config.model, self.no_weights_model - self.logger.info("\t+ Loading no weights AutoModel") + with fast_weights_init(): 
self.load_automodel_from_pretrained() - self.config.model = original_model self.logger.info("\t+ Tying model weights") self.pretrained_model.tie_weights() + self.config.model = original_model + def load_automodel_from_pretrained(self) -> None: - self.pretrained_model = self.automodel_class.from_pretrained( + self.pretrained_model = self.automodel_loader.from_pretrained( self.config.model, **self.automodel_kwargs, **self.config.model_kwargs ).to(self.config.device) diff --git a/optimum_benchmark/backends/transformers_utils.py b/optimum_benchmark/backends/transformers_utils.py index 2ae02100..87755e78 100644 --- a/optimum_benchmark/backends/transformers_utils.py +++ b/optimum_benchmark/backends/transformers_utils.py @@ -2,19 +2,70 @@ from typing import Any, Dict, Optional, Union import torch +import transformers from transformers import ( AutoConfig, + AutoFeatureExtractor, AutoProcessor, AutoTokenizer, FeatureExtractionMixin, GenerationConfig, ImageProcessingMixin, PretrainedConfig, - PreTrainedTokenizer, ProcessorMixin, + SpecialTokensMixin, ) -PretrainedProcessor = Union[FeatureExtractionMixin, ImageProcessingMixin, PreTrainedTokenizer, ProcessorMixin] +from ..import_utils import is_torch_available + +TASKS_TO_MODEL_LOADERS = { + # text processing + "feature-extraction": "AutoModel", + "fill-mask": "AutoModelForMaskedLM", + "multiple-choice": "AutoModelForMultipleChoice", + "question-answering": "AutoModelForQuestionAnswering", + "token-classification": "AutoModelForTokenClassification", + "text-classification": "AutoModelForSequenceClassification", + # audio processing + "audio-xvector": "AutoModelForAudioXVector", + "text-to-audio": "AutoModelForTextToSpectrogram", + "audio-classification": "AutoModelForAudioClassification", + "audio-frame-classification": "AutoModelForAudioFrameClassification", + # image processing + "mask-generation": "AutoModel", + "image-to-image": "AutoModelForImageToImage", + "masked-im": "AutoModelForMaskedImageModeling", + "object-detection": "AutoModelForObjectDetection", + "depth-estimation": "AutoModelForDepthEstimation", + "image-segmentation": "AutoModelForImageSegmentation", + "image-classification": "AutoModelForImageClassification", + "semantic-segmentation": "AutoModelForSemanticSegmentation", + "zero-shot-object-detection": "AutoModelForZeroShotObjectDetection", + "zero-shot-image-classification": "AutoModelForZeroShotImageClassification", + # text generation + "image-to-text": "AutoModelForVision2Seq", + "text-generation": "AutoModelForCausalLM", + "text2text-generation": "AutoModelForSeq2SeqLM", + "visual-question-answering": "AutoModelForVisualQuestionAnswering", + "automatic-speech-recognition": ("AutoModelForSpeechSeq2Seq", "AutoModelForCTC"), +} + + +if is_torch_available(): + TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES = {} + for task_name, model_loaders in TASKS_TO_MODEL_LOADERS.items(): + TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES[task_name] = {} + + if isinstance(model_loaders, str): + model_loaders = (model_loaders,) + + for model_loader_name in model_loaders: + model_loader_class = getattr(transformers, model_loader_name) + TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES[task_name].update(model_loader_class._model_mapping._model_mapping) +else: + TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES = {} + +PretrainedProcessor = Union[FeatureExtractionMixin, ImageProcessingMixin, SpecialTokensMixin, ProcessorMixin] def get_transformers_pretrained_config(model: str, **kwargs) -> "PretrainedConfig": @@ -36,9 +87,12 @@ def get_transformers_pretrained_processor(model: str, 
**kwargs) -> Optional["Pre return AutoProcessor.from_pretrained(model, **kwargs) except Exception: try: - return AutoTokenizer.from_pretrained(model, **kwargs) + return AutoFeatureExtractor.from_pretrained(model, **kwargs) except Exception: - return None + try: + return AutoTokenizer.from_pretrained(model, **kwargs) + except Exception: + return None def extract_transformers_shapes_from_artifacts( @@ -114,6 +168,12 @@ def extract_transformers_shapes_from_artifacts( return shapes +def get_transformers_automodel_loader_for_task(task: str): + model_loader_name = TASKS_TO_MODEL_LOADERS[task] + model_loader_class = getattr(transformers, model_loader_name) + return model_loader_class + + TORCH_INIT_FUNCTIONS = { "normal_": torch.nn.init.normal_, "uniform_": torch.nn.init.uniform_, @@ -131,20 +191,20 @@ def extract_transformers_shapes_from_artifacts( } -def fast_rand(tensor: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: +def fast_random_tensor(tensor: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: return torch.nn.init.uniform_(tensor) @contextmanager -def random_init_weights(): +def fast_weights_init(): # Replace the initialization functions for name, init_func in TORCH_INIT_FUNCTIONS.items(): - if name != "uniform_": - setattr(torch.nn.init, name, fast_rand) + if name != "uniform_": # avoid recursion + setattr(torch.nn.init, name, fast_random_tensor) try: yield finally: # Restore the original initialization functions for name, init_func in TORCH_INIT_FUNCTIONS.items(): - if name != "uniform_": + if name != "uniform_": # avoid recursion setattr(torch.nn.init, name, init_func) diff --git a/optimum_benchmark/backends/vllm/backend.py b/optimum_benchmark/backends/vllm/backend.py index a0833477..1a28de4a 100644 --- a/optimum_benchmark/backends/vllm/backend.py +++ b/optimum_benchmark/backends/vllm/backend.py @@ -1,6 +1,6 @@ import os from tempfile import TemporaryDirectory -from typing import Any, Dict, Tuple +from typing import Any, Dict import torch from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE @@ -9,7 +9,7 @@ from ...task_utils import TEXT_GENERATION_TASKS from ..base import Backend -from ..transformers_utils import random_init_weights +from ..transformers_utils import fast_weights_init from .config import VLLMConfig @@ -18,29 +18,34 @@ class VLLMBackend(Backend[VLLMConfig]): def __init__(self, config: VLLMConfig) -> None: super().__init__(config) - self.validate_task() + if self.config.task not in TEXT_GENERATION_TASKS: + raise NotImplementedError(f"vLLM does not support task {self.config.task}") + + def load(self) -> None: self.logger.info("\t+ Creating backend temporary directory") self.tmpdir = TemporaryDirectory() if self.config.no_weights: + self.logger.info("\t+ Creating no weights model") + self.create_no_weights_model() self.logger.info("\t+ Loading no weights model") self.load_model_with_no_weights() else: self.logger.info("\t+ Downloading pretrained model") self.download_pretrained_model() - - self.logger.info("\t+ Preparing generation config") - self.prepare_generation_config() - + if self.config.task in TEXT_GENERATION_TASKS: + self.logger.info("\t+ Preparing generation config") + self.prepare_generation_config() self.logger.info("\t+ Loading pretrained model") self.load_model_from_pretrained() + self.logger.info("\t+ Cleaning up backend temporary directory") self.tmpdir.cleanup() def download_pretrained_model(self) -> None: with torch.device("meta"): - self.automodel_class.from_pretrained(self.config.model, **self.config.model_kwargs) + 
self.automodel_loader.from_pretrained(self.config.model, **self.config.model_kwargs) def prepare_generation_config(self) -> None: self.generation_config.eos_token_id = None @@ -69,8 +74,8 @@ def create_no_weights_model(self) -> None: self.pretrained_processor.save_pretrained(save_directory=self.no_weights_model) # unlike Transformers, vLLM won't accept any missing tensors so we need to materialize the model self.logger.info(f"\t+ Loading no weights model from {self.no_weights_model}") - with random_init_weights(): - self.pretrained_model = self.automodel_class.from_pretrained( + with fast_weights_init(): + self.pretrained_model = self.automodel_loader.from_pretrained( self.no_weights_model, **self.config.model_kwargs, device_map="auto", _fast_init=False ) self.logger.info("\t+ Saving no weights model") @@ -82,14 +87,10 @@ def create_no_weights_model(self) -> None: self.logger.info("\t+ Modifying generation config for fixed length generation") self.generation_config.eos_token_id = None self.generation_config.pad_token_id = None - self.logger.info("\t+ Saving new pretrained generation config") self.generation_config.save_pretrained(save_directory=self.no_weights_model) def load_model_with_no_weights(self) -> None: - self.logger.info("\t+ Creating no weights model") - self.create_no_weights_model() - original_model, self.config.model = self.config.model, self.no_weights_model self.logger.info("\t+ Loading no weights model") self.load_model_from_pretrained() @@ -125,21 +126,13 @@ def load_model_from_pretrained(self) -> None: seed=self.config.seed, ) - def validate_task(self) -> None: - if self.config.task not in ["text-generation"]: - raise ValueError(f"Task {self.config.task} not supported by {self.NAME}") - - def prepare_inputs( - self, inputs: Dict[str, Any], input_shapes: Dict[str, Any] - ) -> Tuple[Dict[str, Any], Dict[str, Any]]: - inputs, input_shapes = super().prepare_inputs(inputs, input_shapes) - + def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: if self.config.task in TEXT_GENERATION_TASKS: inputs = {"prompts": self.pretrained_processor.batch_decode(inputs["input_ids"])} else: raise NotImplementedError(f"vLLM does not support task {self.config.task}") - return inputs, input_shapes + return inputs def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> Any: return self.pretrained_model.generate( @@ -163,6 +156,7 @@ def prefill(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> Dict[str, A use_tqdm=False, sampling_params=SamplingParams( ignore_eos=True, + detokenize=True, seed=self.config.seed, n=kwargs.get("num_return_sequences"), max_tokens=kwargs.get("max_new_tokens"), @@ -178,6 +172,7 @@ def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> Any: use_tqdm=False, sampling_params=SamplingParams( ignore_eos=True, + detokenize=True, n=kwargs.get("num_return_sequences"), max_tokens=kwargs.get("max_new_tokens"), min_tokens=kwargs.get("min_new_tokens"), diff --git a/optimum_benchmark/launchers/process/launcher.py b/optimum_benchmark/launchers/process/launcher.py index 2067a841..fbc990e1 100644 --- a/optimum_benchmark/launchers/process/launcher.py +++ b/optimum_benchmark/launchers/process/launcher.py @@ -21,11 +21,12 @@ def __init__(self, config: ProcessConfig): if get_start_method(allow_none=True) != self.config.start_method: self.logger.info(f"\t+ Setting multiprocessing start method to {self.config.start_method}") set_start_method(self.config.start_method, force=True) - self.logger.info("\t+ Warming up multiprocessing context") # 
creates the resource tracker with default executable - dummy_process = Process() + self.logger.info("\t+ Warming up multiprocessing context") + dummy_process = Process(target=dummy_target, daemon=False) dummy_process.start() dummy_process.join() + dummy_process.close() def launch(self, worker: Callable[..., BenchmarkReport], worker_args: List[Any]) -> BenchmarkReport: child_connection, parent_connection = Pipe() @@ -110,3 +111,7 @@ def target( logger.info("\t+ Exiting isolated process") connection.close() exit(0) + + +def dummy_target() -> None: + exit(0) diff --git a/optimum_benchmark/scenarios/energy_star/scenario.py b/optimum_benchmark/scenarios/energy_star/scenario.py index fbb4c1ed..3bf003ff 100644 --- a/optimum_benchmark/scenarios/energy_star/scenario.py +++ b/optimum_benchmark/scenarios/energy_star/scenario.py @@ -144,11 +144,12 @@ def run(self, backend: Backend[BackendConfigT]) -> BenchmarkReport: LOGGER.info("\t+ Preparing backend for Inference") backend.prepare_for_inference( - **backend.model_shapes, - **self.config.input_shapes, - **self.config.generate_kwargs, - **self.config.forward_kwargs, - **self.config.call_kwargs, + input_shapes=self.config.input_shapes, + inference_kwargs={ + **self.config.generate_kwargs, + **self.config.forward_kwargs, + **self.config.call_kwargs, + }, ) LOGGER.info("\t+ Warming up backend for Inference") diff --git a/optimum_benchmark/scenarios/inference/config.py b/optimum_benchmark/scenarios/inference/config.py index 3ceb8895..2c05d97f 100644 --- a/optimum_benchmark/scenarios/inference/config.py +++ b/optimum_benchmark/scenarios/inference/config.py @@ -18,15 +18,23 @@ class InferenceConfig(ScenarioConfig): # benchmark options iterations: int = field( default=10, - metadata={"help": "Minimum number of iterations to run the benchmark, set to 0 to disable this constraint"}, + metadata={ + "help": "Minimum number of iterations to run the benchmark. " + "The number of tracked inferences will be at least this value." + "Set to 0 to disable this constraint (benchmark will run for `duration` seconds)." + }, ) duration: int = field( default=10, - metadata={"help": "Minimum duration of the benchmark in seconds, set to 0 to disable this constraint"}, + metadata={ + "help": "Minimum duration of the benchmark in seconds. " + "The sum of tracked inferences will be at least this value." + "Set to 0 to disable this constraint (benchmark will run for `iterations` iterations)." 
+ }, ) warmup_runs: int = field( default=10, - metadata={"help": "Number of warmup runs to perform before benchmarking, set to 0 to disable warmup"}, + metadata={"help": "Number of warmup runs to perform before benchmarking."}, ) # input/output config @@ -40,8 +48,8 @@ class InferenceConfig(ScenarioConfig): ) # tracking options - latency: bool = field(default=True, metadata={"help": "Measure latencies and throughputs"}) memory: bool = field(default=False, metadata={"help": "Measure max memory usage"}) + latency: bool = field(default=True, metadata={"help": "Measure latencies and throughputs"}) energy: bool = field(default=False, metadata={"help": "Measure energy usage and efficiency"}) # methods kwargs diff --git a/optimum_benchmark/scenarios/inference/scenario.py b/optimum_benchmark/scenarios/inference/scenario.py index 2d327df1..c0d9475e 100644 --- a/optimum_benchmark/scenarios/inference/scenario.py +++ b/optimum_benchmark/scenarios/inference/scenario.py @@ -1,4 +1,5 @@ import time +from contextlib import ExitStack from transformers import LogitsProcessorList @@ -68,52 +69,27 @@ def run(self, backend: Backend[BackendConfigT]) -> BenchmarkReport: self.logger.info("\t+ Updating Text Generation kwargs with default values") self.config.generate_kwargs = {**TEXT_GENERATION_DEFAULT_KWARGS, **self.config.generate_kwargs} self.logger.info("\t+ Initializing Text Generation report") - - self.report = BenchmarkReport.from_list(targets=["prefill", "decode", "per_token"]) - + self.report = BenchmarkReport.from_list(targets=["load", "prefill", "decode", "per_token"]) elif backend.config.task in IMAGE_DIFFUSION_TASKS: self.logger.info("\t+ Generating Image Diffusion inputs") self.inputs = self.input_generator() self.logger.info("\t+ Updating Image Diffusion kwargs with default values") self.config.call_kwargs = {**IMAGE_DIFFUSION_DEFAULT_KWARGS, **self.config.call_kwargs} self.logger.info("\t+ Initializing Image Diffusion report") - self.report = BenchmarkReport.from_list(targets=["call"]) - + self.report = BenchmarkReport.from_list(targets=["load", "call"]) else: self.logger.info("\t+ Generating Inference inputs") self.inputs = self.input_generator() self.logger.info("\t+ Initializing Inference report") - self.report = BenchmarkReport.from_list(targets=["forward"]) + self.report = BenchmarkReport.from_list(targets=["load", "forward"]) - self.logger.info("\t+ Preparing inputs for Inference") - self.inputs, self.config.input_shapes = backend.prepare_inputs( - inputs=self.inputs, input_shapes=self.config.input_shapes - ) + self.logger.info("\t+ Preparing input shapes for Inference") + self.config.input_shapes = backend.prepare_input_shapes(input_shapes=self.config.input_shapes) - self.logger.info("\t+ Preparing backend for Inference") - backend.prepare_for_inference( - input_shapes=self.config.input_shapes, - inference_kwargs={ - **self.config.generate_kwargs, - **self.config.forward_kwargs, - **self.config.call_kwargs, - }, - ) + self.run_model_loading_tracking(backend) - if backend.config.task in TEXT_GENERATION_TASKS: - self.logger.info("\t+ Warming up backend for Text Generation") - _ = backend.generate(self.inputs, self.config.generate_kwargs) - for _ in range(self.config.warmup_runs): - _ = backend.generate(self.inputs, {**self.config.generate_kwargs, **TEXT_GENERATION_WARMUP_OVERRIDES}) - elif backend.config.task in IMAGE_DIFFUSION_TASKS: - self.logger.info("\t+ Warming up backend for Image Diffusion") - _ = backend.call(self.inputs, self.config.call_kwargs) - for _ in 
range(self.config.warmup_runs): - _ = backend.call(self.inputs, {**self.config.call_kwargs, **IMAGE_DIFFUSION_WARMUP_OVERRIDES}) - else: - self.logger.info("\t+ Warming up backend for Inference") - for _ in range(self.config.warmup_runs): - _ = backend.forward(self.inputs, self.config.forward_kwargs) + self.logger.info("\t+ Preparing inputs for Inference") + self.inputs = backend.prepare_inputs(inputs=self.inputs) if self.config.memory: if backend.config.task in TEXT_GENERATION_TASKS: @@ -125,6 +101,15 @@ def run(self, backend: Backend[BackendConfigT]) -> BenchmarkReport: self.report.log_memory() + if self.config.latency or self.config.energy: + # latency and energy are metrics that require some warmup + if backend.config.task in TEXT_GENERATION_TASKS: + self.warmup_text_generation(backend) + elif backend.config.task in IMAGE_DIFFUSION_TASKS: + self.warmup_image_diffusion(backend) + else: + self.warmup_inference(backend) + if self.config.latency: if backend.config.task in TEXT_GENERATION_TASKS: if backend.config.name in PER_TOKEN_BACKENDS: @@ -152,6 +137,57 @@ def run(self, backend: Backend[BackendConfigT]) -> BenchmarkReport: return self.report + def warmup_text_generation(self, backend: Backend[BackendConfigT]): + self.logger.info("\t+ Warming up backend for Text Generation") + _ = backend.generate(self.inputs, self.config.generate_kwargs) + for _ in range(self.config.warmup_runs): + _ = backend.generate(self.inputs, {**self.config.generate_kwargs, **TEXT_GENERATION_WARMUP_OVERRIDES}) + + def warmup_image_diffusion(self, backend: Backend[BackendConfigT]): + self.logger.info("\t+ Warming up backend for Image Diffusion") + _ = backend.call(self.inputs, self.config.call_kwargs) + for _ in range(self.config.warmup_runs): + _ = backend.call(self.inputs, {**self.config.call_kwargs, **IMAGE_DIFFUSION_WARMUP_OVERRIDES}) + + def warmup_inference(self, backend: Backend[BackendConfigT]): + self.logger.info("\t+ Warming up backend for Inference") + for _ in range(self.config.warmup_runs): + _ = backend.forward(self.inputs, self.config.forward_kwargs) + + # Loading tracking + def run_model_loading_tracking(self, backend: Backend[BackendConfigT]): + self.logger.info("\t+ Running model loading tracking") + + if self.config.latency: + latency_tracker = LatencyTracker(backend=backend.config.name, device=backend.config.device) + if self.config.memory: + memory_tracker = MemoryTracker( + backend=backend.config.name, device=backend.config.device, device_ids=backend.config.device_ids + ) + if self.config.energy: + energy_tracker = EnergyTracker( + backend=backend.config.name, device=backend.config.device, device_ids=backend.config.device_ids + ) + + context_stack = ExitStack() + if self.config.latency: + context_stack.enter_context(latency_tracker.track()) + if self.config.memory: + context_stack.enter_context(memory_tracker.track()) + if self.config.energy: + context_stack.enter_context(energy_tracker.track()) + + with context_stack: + self.logger.info("\t+ Loading model for Inference") + backend.load() + + if self.config.latency: + self.report.load.latency = latency_tracker.get_latency() + if self.config.memory: + self.report.load.memory = memory_tracker.get_max_memory() + if self.config.energy: + self.report.load.energy = energy_tracker.get_energy() + ## Memory tracking def run_text_generation_memory_tracking(self, backend: Backend[BackendConfigT]): self.logger.info("\t+ Running Text Generation memory tracking") @@ -272,7 +308,7 @@ def run_image_diffusion_latency_tracking(self, backend: 
Backend[BackendConfigT]) ) def run_latency_inference_tracking(self, backend: Backend[BackendConfigT]): - self.logger.info("\t+ Running latency tracking") + self.logger.info("\t+ Running Inference latency tracking") latency_tracker = LatencyTracker(backend=backend.config.name, device=backend.config.device) while latency_tracker.elapsed() < self.config.duration or latency_tracker.count() < self.config.iterations: diff --git a/optimum_benchmark/scenarios/training/scenario.py b/optimum_benchmark/scenarios/training/scenario.py index e7fc67fe..d42fc269 100644 --- a/optimum_benchmark/scenarios/training/scenario.py +++ b/optimum_benchmark/scenarios/training/scenario.py @@ -33,29 +33,38 @@ def run(self, backend: Backend[BackendConfigT]) -> BenchmarkReport: self.logger.info("\t+ Initializing training report") self.report = BenchmarkReport.from_list(targets=["overall", "warmup", "train"]) + self.logger.info("\t+ Loading model into backend") + backend.load() + training_callbackes = [] + if self.config.latency: - self.logger.info("\t+ Adding latency measuring callback") + self.logger.info("\t+ Creating latency tracking callback") latency_callback = StepLatencyTrainerCallback(device=backend.config.device, backend=backend.config.name) + self.logger.info("\t+ Adding latency measuring callback") training_callbackes.append(latency_callback) - training_trackers = [] + context_stack = ExitStack() + if self.config.memory: - self.logger.info("\t+ Adding memory tracking context manager") + self.logger.info("\t+ Creating memory tracking context manager") memory_tracker = MemoryTracker( device=backend.config.device, backend=backend.config.name, device_ids=backend.config.device_ids ) - training_trackers.append(memory_tracker.track()) if self.config.energy: - self.logger.info("\t+ Adding energy tracking context manager") + self.logger.info("\t+ Creating energy tracking context manager") energy_tracker = EnergyTracker(device=backend.config.device, device_ids=backend.config.device_ids) - training_trackers.append(energy_tracker.track()) - with ExitStack() as stack: - for tracker in training_trackers: - stack.enter_context(tracker) + if self.config.memory: + self.logger.info("\t+ Entering memory tracking context manager") + context_stack.enter_context(memory_tracker.track()) + + if self.config.energy: + self.logger.info("\t+ Entering energy tracking context manager") + context_stack.enter_context(energy_tracker.track()) + with context_stack: backend.train( training_dataset=training_dataset, training_callbacks=training_callbackes, diff --git a/optimum_benchmark/task_utils.py b/optimum_benchmark/task_utils.py index cf1701b5..4587097f 100644 --- a/optimum_benchmark/task_utils.py +++ b/optimum_benchmark/task_utils.py @@ -1,59 +1,24 @@ import importlib +import json import os from typing import Optional import huggingface_hub -_TRANSFORMERS_TASKS_TO_MODEL_LOADERS = { - # text processing - "feature-extraction": "AutoModel", - "fill-mask": "AutoModelForMaskedLM", - "multiple-choice": "AutoModelForMultipleChoice", - "question-answering": "AutoModelForQuestionAnswering", - "token-classification": "AutoModelForTokenClassification", - "text-classification": "AutoModelForSequenceClassification", - # audio processing - "audio-xvector": "AutoModelForAudioXVector", - "text-to-audio": "AutoModelForTextToSpectrogram", - "audio-classification": "AutoModelForAudioClassification", - "audio-frame-classification": "AutoModelForAudioFrameClassification", - "conversational": ("AutoModelForCausalLM", "AutoModelForSeq2SeqLM"), - # image 
processing - "mask-generation": "AutoModel", - "image-to-image": "AutoModelForImageToImage", - "masked-im": "AutoModelForMaskedImageModeling", - "object-detection": "AutoModelForObjectDetection", - "depth-estimation": "AutoModelForDepthEstimation", - "image-classification": "AutoModelForImageClassification", - "semantic-segmentation": "AutoModelForSemanticSegmentation", - "zero-shot-object-detection": "AutoModelForZeroShotObjectDetection", - "zero-shot-image-classification": "AutoModelForZeroShotImageClassification", - "image-segmentation": ("AutoModelForImageSegmentation", "AutoModelForSemanticSegmentation"), - # text generation - "image-to-text": "AutoModelForVision2Seq", - "text-generation": "AutoModelForCausalLM", - "text2text-generation": "AutoModelForSeq2SeqLM", - "visual-question-answering": "AutoModelForVisualQuestionAnswering", - "automatic-speech-recognition": ("AutoModelForSpeechSeq2Seq", "AutoModelForCTC"), -} - -_DIFFUSERS_TASKS_TO_MODEL_LOADERS = { - "inpainting": "AutoPipelineForInpainting", - "text-to-image": "AutoPipelineForText2Image", - "image-to-image": "AutoPipelineForImage2Image", - "stable-diffusion": "StableDiffusionPipeline", # should be deprecated - "stable-diffusion-xl": "StableDiffusionXLImg2ImgPipeline", # should be deprecated -} -_TIMM_TASKS_TO_MODEL_LOADERS = { - "image-classification": "create_model", -} - - -_LIBRARY_TO_TASKS_TO_MODEL_LOADER_MAP = { - "timm": _TIMM_TASKS_TO_MODEL_LOADERS, - "diffusers": _DIFFUSERS_TASKS_TO_MODEL_LOADERS, - "transformers": _TRANSFORMERS_TASKS_TO_MODEL_LOADERS, -} +from .backends.diffusers_utils import ( + TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES as DIFFUSERS_TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES, +) +from .backends.diffusers_utils import ( + get_diffusers_pretrained_config, +) +from .backends.timm_utils import get_timm_pretrained_config +from .backends.transformers_utils import ( + TASKS_TO_MODEL_LOADERS, + get_transformers_pretrained_config, +) +from .backends.transformers_utils import ( + TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES as TRANSFORMERS_TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES, +) _SYNONYM_TASK_MAP = { "masked-lm": "fill-mask", @@ -70,18 +35,12 @@ "speech2seq-lm": "automatic-speech-recognition", "sequence-classification": "text-classification", "zero-shot-classification": "text-classification", - "causal-lm-with-past": "text-generation-with-past", - "default-with-past": "feature-extraction-with-past", - "seq2seq-lm-with-past": "text2text-generation-with-past", - "speech2seq-lm-with-past": "automatic-speech-recognition-with-past", } IMAGE_DIFFUSION_TASKS = [ "inpainting", "text-to-image", "image-to-image", - "stable-diffusion", - "stable-diffusion-xl", ] TEXT_GENERATION_TASKS = [ @@ -93,7 +52,6 @@ ] TEXT_EMBEDDING_TASKS = [ - "fill-mask", "feature-extraction", ] @@ -105,70 +63,104 @@ def map_from_synonym(task: str) -> str: def infer_library_from_model_name_or_path(model_name_or_path: str, revision: Optional[str] = None) -> str: - is_local = os.path.isdir(model_name_or_path) + inferred_library_name = None - if is_local: - raise RuntimeError("Cannot infer the library from a local directory yet, please specify the library manually.") + if huggingface_hub.repo_exists(model_name_or_path): + model_info = huggingface_hub.model_info(model_name_or_path, revision=revision) + inferred_library_name = getattr(model_info, "library_name", None) - model_info = huggingface_hub.model_info(model_name_or_path, revision=revision) + if inferred_library_name == "sentence-transformers": + inferred_library_name = "transformers" - 
inferred_library_name = getattr(model_info, "library_name", None) + if inferred_library_name is None: + raise RuntimeError(f"Could not infer library name from repo {model_name_or_path}.") - if inferred_library_name is None: - raise KeyError(f"Could not find the proper library name for {model_name_or_path}.") + elif os.path.isdir(model_name_or_path): + local_files = os.listdir(model_name_or_path) - if inferred_library_name == "sentence-transformers": - inferred_library_name = "transformers" + if "model_index.json" in local_files: + inferred_library_name = "diffusers" + elif "config.json" in local_files: + config_dict = json.load(open(os.path.join(model_name_or_path, "config.json"), "r")) + if "pretrained_cfg" in config_dict or "architecture" in config_dict: + inferred_library_name = "timm" + elif "_diffusers_version" in config_dict: + inferred_library_name = "diffusers" + else: + inferred_library_name = "transformers" - return inferred_library_name + if inferred_library_name is None: + raise KeyError(f"Could not find the proper library name for directory {model_name_or_path}.") + else: + raise KeyError( + f"Could not find the proper library name for {model_name_or_path}" + " because it's neither a repo nor a directory." + ) -# adapted from https://github.com/huggingface/optimum/blob/main/optimum/exporters/tasks.py without torch dependency -def infer_task_from_model_name_or_path(model_name_or_path: str, revision: Optional[str] = None) -> str: - is_local = os.path.isdir(model_name_or_path) + return inferred_library_name - if is_local: - raise RuntimeError("Cannot infer the task from a local directory yet, please specify the task manually.") - model_info = huggingface_hub.model_info(model_name_or_path, revision=revision) +def infer_task_from_model_name_or_path(model_name_or_path: str, revision: Optional[str] = None) -> str: library_name = infer_library_from_model_name_or_path(model_name_or_path, revision=revision) + inferred_task_name = None + if library_name == "timm": inferred_task_name = "image-classification" - elif library_name == "sentence-transformers": inferred_task_name = "feature-extraction" - - elif library_name == "diffusers": - if "text-to-image" in model_info.tags: - inferred_task_name = "text-to-image" - elif "image-to-image" in model_info.tags: - inferred_task_name = "image-to-image" - elif "inpainting" in model_info.tags: - inferred_task_name = "inpainting" - else: - class_name = model_info.config["diffusers"]["class_name"] - inferred_task_name = "stable-diffusion-xl" if "XL" in class_name else "stable-diffusion" - - elif library_name == "transformers": - if model_info.pipeline_tag is not None: - inferred_task_name = map_from_synonym(model_info.pipeline_tag) - else: - pipeline_tag = model_info.transformersInfo.pipeline_tag - - if model_info.transformers_info is not None and pipeline_tag is not None: - inferred_task_name = map_from_synonym(pipeline_tag) + elif huggingface_hub.repo_exists(model_name_or_path): + model_info = huggingface_hub.model_info(model_name_or_path, revision=revision) + + if library_name == "diffusers": + if model_info.pipeline_tag is not None: + inferred_task_name = map_from_synonym(model_info.pipeline_tag) + elif library_name == "transformers": + if model_info.pipeline_tag is not None: + inferred_task_name = map_from_synonym(model_info.pipeline_tag) else: - auto_model_class_name = model_info.transformers_info["auto_model"] - tasks_to_automodels = _LIBRARY_TO_TASKS_TO_MODEL_LOADER_MAP[model_info.library_name] - for task_name, class_name_for_task in 
tasks_to_automodels.items(): - if class_name_for_task == auto_model_class_name: + if model_info.transformers_info is not None and model_info.transformersInfo.pipeline_tag is not None: + inferred_task_name = map_from_synonym(model_info.transformersInfo.pipeline_tag) + else: + auto_model_class_name = model_info.transformers_info["auto_model"] + for task_name, model_loaders in TASKS_TO_MODEL_LOADERS.items(): + if isinstance(model_loaders, str): + model_loaders = (model_loaders,) + for model_loader in model_loaders: + if auto_model_class_name == model_loader: + inferred_task_name = task_name + break + if inferred_task_name is not None: + break + elif os.path.isdir(model_name_or_path): + if library_name == "diffusers": + diffusers_config = get_diffusers_pretrained_config(model_name_or_path, revision=revision) + class_name = diffusers_config["_class_name"] + + for task_name, model_mapping in DIFFUSERS_TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES.items(): + for model_type, model_class_name in model_mapping.items(): + if class_name == model_class_name: inferred_task_name = task_name break - inferred_task_name = None - - else: - raise NotImplementedError(f"Library {library_name} is not supported yet.") + if inferred_task_name is not None: + break + elif library_name == "transformers": + auto_modeling_module = importlib.import_module("transformers.models.auto.modeling_auto") + transformers_config = get_transformers_pretrained_config(model_name_or_path, revision=revision) + model_type = transformers_config.model_type + + for task_name, model_loaders in TRANSFORMERS_TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES.items(): + if isinstance(model_loaders, str): + model_loaders = (model_loaders,) + for model_loader in model_loaders: + model_loader_class = getattr(auto_modeling_module, model_loader) + model_mapping = model_loader_class._model_mapping._model_mapping + if model_type in model_mapping: + inferred_task_name = task_name + break + if inferred_task_name is not None: + break if inferred_task_name is None: raise KeyError(f"Could not find the proper task name for {auto_model_class_name}.") @@ -176,52 +168,36 @@ def infer_task_from_model_name_or_path(model_name_or_path: str, revision: Option return inferred_task_name -# adapted from https://github.com/huggingface/optimum/blob/main/optimum/exporters/tasks.py without torch dependency -def get_automodel_class_for_task( - task: str, - auto_model_class_name: Optional[str] = None, - model_type: Optional[str] = None, - library: str = "transformers", - framework: str = "pt", -): - task = map_from_synonym(task) - - if framework == "pt": - tasks_to_model_loader = _LIBRARY_TO_TASKS_TO_MODEL_LOADER_MAP[library] - elif framework == "jax": - raise NotImplementedError("JAX is not supported yet.") - elif framework == "tf": - raise NotImplementedError("TensorFlow is not supported yet.") - else: - raise NotImplementedError("Only PyTorch is supported for now.") - - loaded_library = importlib.import_module(library) - - if auto_model_class_name is None: - if task not in tasks_to_model_loader: - raise KeyError( - f"Unknown task: {task}. 
Possible values are: " - + ", ".join([f"`{key}` for {tasks_to_model_loader[key]}" for key in tasks_to_model_loader]) - ) - - if isinstance(tasks_to_model_loader[task], str): - inferred_auto_model_class_name = tasks_to_model_loader[task] - elif isinstance(tasks_to_model_loader[task], tuple): - if model_type is None: - inferred_auto_model_class_name = tasks_to_model_loader[task][0] - else: - for auto_class_name in tasks_to_model_loader[task]: - model_mapping = getattr(loaded_library, auto_class_name)._model_mapping._model_mapping +def infer_model_type_from_model_name_or_path(model_name_or_path: str, revision: Optional[str] = None) -> str: + library_name = infer_library_from_model_name_or_path(model_name_or_path, revision=revision) - if model_type in model_mapping or model_type.replace("-", "_") in model_mapping: - inferred_auto_model_class_name = auto_class_name - break + inferred_model_type = None - inferred_auto_model_class_name = None + if library_name == "timm": + timm_config = get_timm_pretrained_config(model_name_or_path) + inferred_model_type = timm_config.architecture + + elif library_name == "diffusers": + from diffusers import DiffusionPipeline + + config = DiffusionPipeline.load_config(model_name_or_path) + config, _ = config if isinstance(config, tuple) else (config, None) + class_name = config["_class_name"] + + for task_name, model_mapping in DIFFUSERS_TASKS_TO_MODEL_TYPES_TO_MODEL_CLASSES.items(): + for model_type, model_class_name in model_mapping.items(): + if model_class_name == class_name: + inferred_model_type = model_type + break + if inferred_model_type is not None: + break + else: + from transformers import AutoConfig - if inferred_auto_model_class_name is None: - raise ValueError(f"Could not find the model class name for task {task}.") + config = AutoConfig.from_pretrained(model_name_or_path) + inferred_model_type = config.model_type - inferred_model_class = getattr(loaded_library, inferred_auto_model_class_name) + if inferred_model_type is None: + raise KeyError(f"Could not find the proper model type for {model_name_or_path}.") - return inferred_model_class + return inferred_model_type diff --git a/optimum_benchmark/trackers/latency.py b/optimum_benchmark/trackers/latency.py index 6b8d614f..c130d162 100644 --- a/optimum_benchmark/trackers/latency.py +++ b/optimum_benchmark/trackers/latency.py @@ -44,9 +44,6 @@ def __getitem__(self, index) -> float: raise ValueError(f"Invalid index type: {type(index)}, expected int or slice") def __sub__(self, latency: "Latency") -> "Latency": - if not isinstance(latency, Latency): - raise ValueError(f"Cannot subtract {type(latency)} from Latency") - latencies = [lat - latency.mean for lat in self.values] assert not any(latency < 0 for latency in latencies), "Negative latency detected" @@ -82,14 +79,14 @@ def from_values(values: List[float], unit: str) -> "Latency": def log(self, prefix: str = "method"): stdev_percentage = 100 * self.stdev / self.mean if self.mean > 0 else 0 LOGGER.info(f"\t\t+ {prefix} latency:") - LOGGER.info(f"\t\t\t+ count: {self.count}") - LOGGER.info(f"\t\t\t+ total: {self.total:f} {self.unit}") - LOGGER.info(f"\t\t\t+ mean: {self.mean:f} {self.unit}") - LOGGER.info(f"\t\t\t+ stdev: {self.stdev:f} {self.unit} ({stdev_percentage:.2f}%)") - LOGGER.info(f"\t\t\t+ p50: {self.p50:f} {self.unit}") - LOGGER.info(f"\t\t\t+ p90: {self.p90:f} {self.unit}") - LOGGER.info(f"\t\t\t+ p95: {self.p95:f} {self.unit}") - LOGGER.info(f"\t\t\t+ p99: {self.p99:f} {self.unit}") + LOGGER.info(f"\t\t\t- count: {self.count}") + 
LOGGER.info(f"\t\t\t- total: {self.total:f} {self.unit}") + LOGGER.info(f"\t\t\t- mean: {self.mean:f} {self.unit}") + LOGGER.info(f"\t\t\t- stdev: {self.stdev:f} {self.unit} ({stdev_percentage:.2f}%)") + LOGGER.info(f"\t\t\t- p50: {self.p50:f} {self.unit}") + LOGGER.info(f"\t\t\t- p90: {self.p90:f} {self.unit}") + LOGGER.info(f"\t\t\t- p95: {self.p95:f} {self.unit}") + LOGGER.info(f"\t\t\t- p99: {self.p99:f} {self.unit}") @dataclass diff --git a/tests/configs/_bert_.yaml b/tests/configs/_bert_.yaml index a9b5a38a..e54d2925 100644 --- a/tests/configs/_bert_.yaml +++ b/tests/configs/_bert_.yaml @@ -1,2 +1,3 @@ backend: model: google-bert/bert-base-uncased + task: feature-extraction diff --git a/tests/configs/_diffusers_.yaml b/tests/configs/_diffusers_.yaml index 0f8e4d27..607b2502 100644 --- a/tests/configs/_diffusers_.yaml +++ b/tests/configs/_diffusers_.yaml @@ -1,4 +1,4 @@ backend: library: diffusers - task: stable-diffusion + task: text-to-image model: hf-internal-testing/tiny-stable-diffusion-torch diff --git a/tests/configs/cuda_inference_py_txi_bert.yaml b/tests/configs/cuda_inference_py_txi_bert.yaml index 68c726c5..62405f30 100644 --- a/tests/configs/cuda_inference_py_txi_bert.yaml +++ b/tests/configs/cuda_inference_py_txi_bert.yaml @@ -3,7 +3,7 @@ defaults: - _base_ # inherits from base config - _cuda_ # inherits from cuda config - _inference_ # inherits from inference config - - _bert_ # inherits from gpt config + - _bert_ # inherits from bert config - _self_ # hydra 1.1 compatibility - override backend: py-txi diff --git a/tests/test_api.py b/tests/test_api.py index c54fb075..56a44079 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -37,7 +37,7 @@ ("transformers", "text-classification", "FacebookAI/roberta-base"), ("transformers", "token-classification", "microsoft/deberta-v3-base"), ("transformers", "image-classification", "google/vit-base-patch16-224"), - ("diffusers", "stable-diffusion", "CompVis/stable-diffusion-v1-4"), + ("diffusers", "text-to-image", "CompVis/stable-diffusion-v1-4"), ] From 6351e36d37f4bc6aba5b9a4e7bac79b79cc14838 Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Mon, 15 Jul 2024 17:15:44 +0200 Subject: [PATCH 5/6] Update readme (#228) --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 083b5221..780b4fbb 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,8 @@ Optimum-Benchmark is a unified [multi-backend & multi-device](#backends--devices *News* 📰 - 🥳 PyPI package is now available for installation: `pip install optimum-benchmark` 🎉 [check it out](https://pypi.org/project/optimum-benchmark/) ! -- numactl support for Process and Torchrun launchers to control the NUMA nodes on which the benchmark runs 🧠 +- Model loading latency/memory/energy tracking for all backends in the inference scenario 🚀 +- numactl support for Process and Torchrun launchers to control the NUMA nodes on which the benchmark runs. 
- 4 minimal docker images (`cpu`, `cuda`, `rocm`, `cuda-ort`) in [packages](https://github.com/huggingface/optimum-benchmark/pkgs/container/optimum-benchmark) for testing, benchmarking and reproducibility 🐳 - vLLM backend for benchmarking [vLLM](https://github.com/vllm-project/vllm)'s inference engine 🚀 - Hosting the codebase of the [LLM-Perf Leaderboard](https://huggingface.co/spaces/optimum/llm-perf-leaderboard) 🥇 From 9337b863a5d9e4991ca83847e6c2f66a543b52d9 Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Mon, 22 Jul 2024 14:22:15 +0200 Subject: [PATCH 6/6] Update vllm backend to support offline and online serving modes (#232) --- ...aml => test_cli_cuda_vllm_single_gpu.yaml} | 2 +- examples/vllm_llama.yaml | 7 +- optimum_benchmark/backends/config.py | 22 +--- optimum_benchmark/backends/pytorch/config.py | 6 + .../backends/transformers_utils.py | 8 +- optimum_benchmark/backends/vllm/backend.py | 112 ++++++++---------- optimum_benchmark/backends/vllm/config.py | 66 +++++++---- tests/configs/_serving_mode_.yaml | 5 + tests/configs/cuda_inference_vllm_bloom.yaml | 2 + tests/test_cli.py | 8 +- 10 files changed, 131 insertions(+), 107 deletions(-) rename .github/workflows/{test_cli_cuda_vllm.yaml => test_cli_cuda_vllm_single_gpu.yaml} (94%) create mode 100644 tests/configs/_serving_mode_.yaml diff --git a/.github/workflows/test_cli_cuda_vllm.yaml b/.github/workflows/test_cli_cuda_vllm_single_gpu.yaml similarity index 94% rename from .github/workflows/test_cli_cuda_vllm.yaml rename to .github/workflows/test_cli_cuda_vllm_single_gpu.yaml index d4f9042d..66ce017a 100644 --- a/.github/workflows/test_cli_cuda_vllm.yaml +++ b/.github/workflows/test_cli_cuda_vllm_single_gpu.yaml @@ -50,4 +50,4 @@ jobs: run: | pip install packaging pip install -e .[testing,vllm,flash-attn] - pytest -x -s -k "cli and cuda and vllm" + FORCE_SERIAL=1 pytest -x -s -k "cli and cuda and vllm" diff --git a/examples/vllm_llama.yaml b/examples/vllm_llama.yaml index 0700dd3b..8bbb4025 100644 --- a/examples/vllm_llama.yaml +++ b/examples/vllm_llama.yaml @@ -14,9 +14,12 @@ launcher: backend: device: cuda - device_ids: 2 - no_weights: true + device_ids: 0 + no_weights: false + serving_mode: offline model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 + engine_args: + enforce_eager: true scenario: input_shapes: diff --git a/optimum_benchmark/backends/config.py b/optimum_benchmark/backends/config.py index ba09267b..8be19c3d 100644 --- a/optimum_benchmark/backends/config.py +++ b/optimum_benchmark/backends/config.py @@ -30,10 +30,10 @@ class BackendConfig(ABC): processor: Optional[str] = None device: Optional[str] = None - device_ids: Optional[str] = None - # yes we use a string here instead of a list + # we use a string here instead of a list # because it's easier to pass in a yaml or from cli # and it's consistent with GPU environment variables + device_ids: Optional[str] = None seed: int = 42 inter_op_num_threads: Optional[int] = None @@ -44,9 +44,6 @@ class BackendConfig(ABC): # processor kwargs that are added to its init method/constructor processor_kwargs: Dict[str, Any] = field(default_factory=dict) - # deprecated - hub_kwargs: Dict[str, Any] = field(default_factory=dict) - def __post_init__(self): if self.model is None: raise ValueError("`model` must be specified.") @@ -54,23 +51,16 @@ def __post_init__(self): if self.processor is None: self.processor = self.model - if self.hub_kwargs: - LOGGER.warning( - "`hub_kwargs` is deprecated and will be removed in future versions." 
- "Please use `model_kwargs` and `processor_kwargs` seperately." - ) - self.model_kwargs = {**self.model_kwargs, **self.hub_kwargs} - self.processor_kwargs = {**self.processor_kwargs, **self.hub_kwargs} - + # TODO: add cache_dir, token, etc. to these methods if self.task is None: - self.task = infer_task_from_model_name_or_path(self.model, self.hub_kwargs.get("revision", None)) + self.task = infer_task_from_model_name_or_path(self.model, self.model_kwargs.get("revision", None)) if self.library is None: - self.library = infer_library_from_model_name_or_path(self.model, self.hub_kwargs.get("revision", None)) + self.library = infer_library_from_model_name_or_path(self.model, self.model_kwargs.get("revision", None)) if self.model_type is None: self.model_type = infer_model_type_from_model_name_or_path( - self.model, self.hub_kwargs.get("revision", None) + self.model, self.model_kwargs.get("revision", None) ) if self.device is None: diff --git a/optimum_benchmark/backends/pytorch/config.py b/optimum_benchmark/backends/pytorch/config.py index a519fa2f..225718e5 100644 --- a/optimum_benchmark/backends/pytorch/config.py +++ b/optimum_benchmark/backends/pytorch/config.py @@ -54,6 +54,12 @@ class PyTorchConfig(BackendConfig): def __post_init__(self): super().__post_init__() + if self.model_kwargs.get("torch_dtype", None) is not None: + raise ValueError( + "`torch_dtype` is an explicit argument in the PyTorch backend config. " + "Please remove it from the `model_kwargs` and set it in the backend config directly." + ) + if self.device_map is not None and self.device_map not in DEVICE_MAPS: raise ValueError(f"`device_map` must be one of {DEVICE_MAPS}. Got {self.device_map} instead.") diff --git a/optimum_benchmark/backends/transformers_utils.py b/optimum_benchmark/backends/transformers_utils.py index 87755e78..3781da46 100644 --- a/optimum_benchmark/backends/transformers_utils.py +++ b/optimum_benchmark/backends/transformers_utils.py @@ -1,3 +1,4 @@ +import warnings from contextlib import contextmanager from typing import Any, Dict, Optional, Union @@ -107,7 +108,12 @@ def extract_transformers_shapes_from_artifacts( processor_dict = {k: v for k, v in processor.to_dict().items() if v is not None} artifacts_dict.update(processor_dict) elif processor is not None: - processor_dict = {k: getattr(processor, k) for k in dir(processor) if isinstance(getattr(processor, k), int)} + try: + processor_dict = { + k: getattr(processor, k) for k in dir(processor) if isinstance(getattr(processor, k), int) + } + except Exception: + warnings.warn(f"Could not extract shapes from processor {processor}") shapes = {} diff --git a/optimum_benchmark/backends/vllm/backend.py b/optimum_benchmark/backends/vllm/backend.py index 1a28de4a..e90f3e7e 100644 --- a/optimum_benchmark/backends/vllm/backend.py +++ b/optimum_benchmark/backends/vllm/backend.py @@ -1,11 +1,12 @@ +import asyncio import os from tempfile import TemporaryDirectory -from typing import Any, Dict +from typing import Any, Dict, Union import torch from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from safetensors.torch import save_file -from vllm import LLM, SamplingParams +from vllm import AsyncEngineArgs, AsyncLLMEngine, EngineArgs, LLMEngine, SamplingParams from ...task_utils import TEXT_GENERATION_TASKS from ..base import Backend @@ -15,6 +16,7 @@ class VLLMBackend(Backend[VLLMConfig]): NAME: str = "vllm" + pretrained_model: Union[LLMEngine, AsyncLLMEngine] def __init__(self, config: VLLMConfig) -> None: super().__init__(config) @@ -97,34 +99,10 @@ def 
load_model_with_no_weights(self) -> None: self.config.model = original_model def load_model_from_pretrained(self) -> None: - self.pretrained_model = LLM( - model=self.config.model, - # tokenizer - tokenizer=self.config.processor, - tokenizer_mode=self.config.tokenizer_mode, - skip_tokenizer_init=self.config.skip_tokenizer_init, - # device - device=self.config.device, - # parallelism - tensor_parallel_size=self.config.tensor_parallel_size, - # precision - quantization=self.config.quantization, - dtype=self.config.dtype, - # memory - swap_space=self.config.swap_space, - gpu_memory_utilization=self.config.gpu_memory_utilization, - # cuda graphs - enforce_eager=self.config.enforce_eager, - max_context_len_to_capture=self.config.max_context_len_to_capture, - max_seq_len_to_capture=self.config.max_seq_len_to_capture, - # kernels - disable_custom_all_reduce=self.config.disable_custom_all_reduce, - # additional stuff - trust_remote_code=self.config.model_kwargs.get("trust_remote_code", False), - tokenizer_revision=self.config.processor_kwargs.get("revision", None), - revision=self.config.model_kwargs.get("revision", None), - seed=self.config.seed, - ) + if self.config.serving_mode == "offline": + self.pretrained_model = LLMEngine.from_engine_args(EngineArgs(**self.config.to_engine_args())) + else: + self.pretrained_model = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(**self.config.to_engine_args())) def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: if self.config.task in TEXT_GENERATION_TASKS: @@ -134,11 +112,31 @@ def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: return inputs - def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> Any: - return self.pretrained_model.generate( - **inputs, - use_tqdm=False, - sampling_params=SamplingParams( + def batch_offline_engine_generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> Any: + for i, prompt in enumerate(inputs["prompts"]): + self.pretrained_model.add_request( + inputs=prompt, + request_id=str(i), + params=SamplingParams( + ignore_eos=True, + detokenize=True, + seed=self.config.seed, + n=kwargs.get("num_return_sequences"), + max_tokens=kwargs.get("max_new_tokens"), + min_tokens=kwargs.get("min_new_tokens"), + use_beam_search=kwargs.get("num_beams") > 1, + logits_processors=kwargs.get("logits_processors", None), + ), + ) + + while self.pretrained_model.has_unfinished_requests(): + self.pretrained_model.step() + + async def single_online_engine_generate(self, prompt: str, request_id: str, kwargs: Dict[str, Any]) -> Any: + stream = await self.pretrained_model.add_request( + inputs=prompt, + request_id=request_id, + params=SamplingParams( ignore_eos=True, detokenize=True, seed=self.config.seed, @@ -150,33 +148,23 @@ def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> Any: ), ) + async for _ in stream: + pass + + async def batch_online_engine_generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> Any: + tasks = [ + self.single_online_engine_generate(prompt, str(i), kwargs) for i, prompt in enumerate(inputs["prompts"]) + ] + await asyncio.gather(*tasks) + def prefill(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> Dict[str, Any]: - return self.pretrained_model.generate( - **inputs, - use_tqdm=False, - sampling_params=SamplingParams( - ignore_eos=True, - detokenize=True, - seed=self.config.seed, - n=kwargs.get("num_return_sequences"), - max_tokens=kwargs.get("max_new_tokens"), - min_tokens=kwargs.get("min_new_tokens"), - 
use_beam_search=kwargs.get("num_beams") > 1, - logits_processors=kwargs.get("logits_processors", None), - ), - ) + if self.config.serving_mode == "offline": + self.batch_offline_engine_generate(inputs, kwargs) + else: + asyncio.run(self.batch_online_engine_generate(inputs, kwargs)) def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> Any: - return self.pretrained_model.generate( - **inputs, - use_tqdm=False, - sampling_params=SamplingParams( - ignore_eos=True, - detokenize=True, - n=kwargs.get("num_return_sequences"), - max_tokens=kwargs.get("max_new_tokens"), - min_tokens=kwargs.get("min_new_tokens"), - use_beam_search=kwargs.get("num_beams") > 1, - logits_processors=kwargs.get("logits_processors", None), - ), - ) + if self.config.serving_mode == "offline": + self.batch_offline_engine_generate(inputs, kwargs) + else: + asyncio.run(self.batch_online_engine_generate(inputs, kwargs)) diff --git a/optimum_benchmark/backends/vllm/config.py b/optimum_benchmark/backends/vllm/config.py index 59cb859c..44bd9428 100644 --- a/optimum_benchmark/backends/vllm/config.py +++ b/optimum_benchmark/backends/vllm/config.py @@ -1,5 +1,5 @@ -from dataclasses import dataclass -from typing import Optional +from dataclasses import dataclass, field +from typing import Any, Dict, Optional from ...import_utils import vllm_version from ..config import BackendConfig @@ -11,36 +11,54 @@ class VLLMConfig(BackendConfig): version: Optional[str] = vllm_version() _target_: str = "optimum_benchmark.backends.vllm.backend.VLLMBackend" - # optimum-benchmark + # creates a model from scratch with dummy weights no_weights: bool = False - # tokenizer - tokenizer_mode: str = "auto" - skip_tokenizer_init: bool = False + # decides whether to use the offline or online llm engine + serving_mode: str = "online" - # parallelism - tensor_parallel_size: int = 1 + # passed to EngineArgs + engine_args: Dict[str, Any] = field(default_factory=dict) - # precision - dtype: str = "auto" - quantization: Optional[str] = None + def __post_init__(self): + # duplicates that are handled by the backend config directly + if "model" in self.engine_args: + raise ValueError("model should not be passed in `backend.engine_args`, use `backend.model` instead") - # cuda graphs - enforce_eager: bool = False - max_context_len_to_capture: Optional[int] = None - max_seq_len_to_capture: int = 8192 + if "tokenizer" in self.engine_args: + raise ValueError("tokenizer should not be passed in `backend.engine_args`, use `backend.processor` instead") - # kernels - disable_custom_all_reduce: bool = False + if "device" in self.engine_args: + raise ValueError("device should not be passed in `backend.engine_args`, use `backend.device` instead") - # memory - gpu_memory_utilization: float = 0.9 - swap_space: int = 4 + if self.serving_mode not in ["offline", "online"]: + raise ValueError(f"Invalid serving_mode: {self.serving_mode}. 
Must be 'online' or 'offline'.") + + # needed for task/library/model_type inference + self.model_kwargs = { + "revision": self.engine_args.get("revision", "main"), + "trust_remote_code": self.engine_args.get("trust_remote_code", False), + **self.model_kwargs, + } + self.processor_kwargs = { + "revision": self.engine_args.get("tokenizer_revision", "main"), + "trust_remote_code": self.engine_args.get("trust_remote_code", False), + **self.processor_kwargs, + } - def __post_init__(self): super().__post_init__() - self.device = self.device.lower() + if self.engine_args.get("disable_log_stats", None) is None: + self.engine_args["disable_log_stats"] = True + + if self.serving_mode == "online": + if self.engine_args.get("disable_log_requests", None) is None: + self.engine_args["disable_log_requests"] = True - if self.device not in ["cuda", "neuron", "cpu"]: - raise ValueError(f"VLLM Backend only supports 'cpu', 'cuda' and 'neuron' devices, got {self.device}") + def to_engine_args(self) -> Dict[str, Any]: + return dict( + model=self.model, + tokenizer=self.processor, + device=self.device, + **self.engine_args, + ) diff --git a/tests/configs/_serving_mode_.yaml b/tests/configs/_serving_mode_.yaml new file mode 100644 index 00000000..4b7523a0 --- /dev/null +++ b/tests/configs/_serving_mode_.yaml @@ -0,0 +1,5 @@ +hydra: + mode: MULTIRUN + sweeper: + params: + backend.serving_mode: online,offline diff --git a/tests/configs/cuda_inference_vllm_bloom.yaml b/tests/configs/cuda_inference_vllm_bloom.yaml index 9c1cb304..ba9d92af 100644 --- a/tests/configs/cuda_inference_vllm_bloom.yaml +++ b/tests/configs/cuda_inference_vllm_bloom.yaml @@ -3,6 +3,8 @@ defaults: - _base_ # inherits from base config - _cuda_ # inherits from cuda config - _inference_ # inherits from inference config + - _serving_mode_ # inherits from serving_mode config + - _no_weights_ # inherits from no weights config - _bloom_ # inherits from bloom config - _self_ # hydra 1.1 compatibility - override backend: vllm diff --git a/tests/test_cli.py b/tests/test_cli.py index 156ec5a1..806eedfa 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -8,6 +8,7 @@ LOGGER = getLogger("test") +FORCE_SERIAL = os.environ.get("FORCE_SERIAL", "0") == "1" TEST_CONFIG_DIR = "/".join(__file__.split("/")[:-1] + ["configs"]) TEST_CONFIG_NAMES = [ config.split(".")[0] @@ -24,12 +25,17 @@ def test_cli_configs(config_name): TEST_CONFIG_DIR, "--config-name", config_name, - # to run the tests faster (comment for debugging) + # to run the tests faster "hydra/launcher=joblib", "hydra.launcher.batch_size=1", "hydra.launcher.prefer=threads", ] + if FORCE_SERIAL: + args += ["hydra.launcher.n_jobs=1"] + else: + args += ["hydra.launcher.n_jobs=-1"] + popen = run_subprocess_and_log_stream_output(LOGGER, args) assert popen.returncode == 0, f"Failed to run {config_name}"
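
With the `serving_mode` switch introduced in PATCH 6/6, the vLLM backend is driven either by the synchronous `LLMEngine` (offline) or the asynchronous `AsyncLLMEngine` (online), and everything under `backend.engine_args` is forwarded to `EngineArgs`/`AsyncEngineArgs` via `to_engine_args()`. As a rough usage sketch (not part of the patch itself), a Hydra config exercising the online path could mirror `examples/vllm_llama.yaml`; the `defaults`, `launcher` and `scenario` sections are elided here, and the `gpu_memory_utilization` entry is only an illustrative assumption about available engine arguments:

    backend:
      device: cuda
      device_ids: 0
      no_weights: false
      serving_mode: online          # "offline" selects LLMEngine, "online" selects AsyncLLMEngine
      model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
      engine_args:
        enforce_eager: true         # forwarded verbatim to EngineArgs/AsyncEngineArgs
        gpu_memory_utilization: 0.9 # assumed engine argument, not set in the patch's example

Note that `model`, `tokenizer` and `device` must stay at the `backend` level; the config's `__post_init__` rejects them inside `engine_args`.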