From 92a5cea4dd0c750c1de59504f213068dc3bba59c Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 21 Aug 2024 13:17:29 +0000 Subject: [PATCH 01/73] add intel pytorch ort and openvino to leaderboard --- .gitignore | 5 + .../cuda}/update_llm_perf_cuda_pytorch.py | 9 +- .../intel/update_llm_perf_intel_openvino.py | 179 +++++++++++++++++ .../intel/update_llm_perf_intel_ort.py | 182 ++++++++++++++++++ .../intel/update_llm_perf_intel_pytorch.py | 182 ++++++++++++++++++ llm_perf/utils.py | 10 +- 6 files changed, 560 insertions(+), 7 deletions(-) rename llm_perf/{ => hardware/cuda}/update_llm_perf_cuda_pytorch.py (97%) create mode 100644 llm_perf/hardware/intel/update_llm_perf_intel_openvino.py create mode 100644 llm_perf/hardware/intel/update_llm_perf_intel_ort.py create mode 100644 llm_perf/hardware/intel/update_llm_perf_intel_pytorch.py diff --git a/.gitignore b/.gitignore index f26fda31e..31f9b57f0 100644 --- a/.gitignore +++ b/.gitignore @@ -173,6 +173,11 @@ experiments/ amdsmi/ amd-* +# Code carbon +generate_codecarbon.json +task_codecarbon.json +prefill_codecarbon.json + # Mac specific .DS_Store outputs/ \ No newline at end of file diff --git a/llm_perf/update_llm_perf_cuda_pytorch.py b/llm_perf/hardware/cuda/update_llm_perf_cuda_pytorch.py similarity index 97% rename from llm_perf/update_llm_perf_cuda_pytorch.py rename to llm_perf/hardware/cuda/update_llm_perf_cuda_pytorch.py index 51ab678f3..8b65e1f5c 100644 --- a/llm_perf/update_llm_perf_cuda_pytorch.py +++ b/llm_perf/hardware/cuda/update_llm_perf_cuda_pytorch.py @@ -17,6 +17,7 @@ SUBSET = os.getenv("SUBSET", None) MACHINE = os.getenv("MACHINE", None) +HARDWARE = "cuda" if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: @@ -28,7 +29,7 @@ else: raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") -ATTENTION_COFIGS = ["eager", "sdpa", "flash_attention_2"] +ATTENTION_CONFIGS = ["eager", "sdpa", "flash_attention_2"] if SUBSET == "unquantized": WEIGHTS_CONFIGS = { # unquantized @@ -104,7 +105,7 @@ def benchmark_cuda_pytorch(model, attn_implementation, weights_config): quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] - if not is_benchmark_supported(weights_config, attn_implementation): + if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") return @@ -167,13 +168,13 @@ def benchmark_cuda_pytorch(model, attn_implementation, weights_config): setup_logging(level="INFO", prefix="MAIN-PROCESS") models_attentions_weights = list( - product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_COFIGS, WEIGHTS_CONFIGS.keys()) + product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) ) LOGGER.info( f"Running a total of {len(models_attentions_weights)} benchmarks, " f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " - f"{len(ATTENTION_COFIGS)} attentions implementations " + f"{len(ATTENTION_CONFIGS)} attentions implementations " f"and {len(WEIGHTS_CONFIGS)} weights configurations." 
) diff --git a/llm_perf/hardware/intel/update_llm_perf_intel_openvino.py b/llm_perf/hardware/intel/update_llm_perf_intel_openvino.py new file mode 100644 index 000000000..1aa6052a9 --- /dev/null +++ b/llm_perf/hardware/intel/update_llm_perf_intel_openvino.py @@ -0,0 +1,179 @@ +import os +import traceback +from itertools import product +from logging import getLogger + +from llm_perf.utils import ( + CANONICAL_PRETRAINED_OPEN_LLM_LIST, + GENERATE_KWARGS, + INPUT_SHAPES, + OPEN_LLM_LIST, + PRETRAINED_OPEN_LLM_LIST, + is_benchmark_conducted, + is_benchmark_supported, +) +from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport, InferenceConfig, ProcessConfig, OVConfig +from optimum_benchmark.logging_utils import setup_logging + +SUBSET = os.getenv("SUBSET", None) +MACHINE = os.getenv("MACHINE", None) +HARDWARE = "intel" + + +if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: + PUSH_REPO_ID = "optimum-benchmark/llm-perf-openvino-intel-debug" + CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] + SUBSET = "unquantized" +elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: + PUSH_REPO_ID = f"optimum-benchmark/llm-perf-openvino-intel-{SUBSET}-{MACHINE}" +else: + raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") + +ATTENTION_CONFIGS = ["eager", "sdpa"] +if SUBSET == "unquantized": + WEIGHTS_CONFIGS = { + # unquantized + "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, + "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, + "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, + } +elif SUBSET == "bnb": + WEIGHTS_CONFIGS = { + # bnb + "4bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_4bit": True}}, + "8bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_8bit": True}}, + } +elif SUBSET == "gptq": + WEIGHTS_CONFIGS = { + # gptq + "4bit-gptq-exllama-v1": { + "quant_scheme": "gptq", + "torch_dtype": "float16", + "quant_config": {"bits": 4, "use_exllama ": True, "version": 1, "model_seqlen": 256}, + }, + "4bit-gptq-exllama-v2": { + "torch_dtype": "float16", + "quant_scheme": "gptq", + "quant_config": {"bits": 4, "use_exllama ": True, "version": 2, "model_seqlen": 256}, + }, + } +elif SUBSET == "awq": + WEIGHTS_CONFIGS = { + # awq + "4bit-awq-gemm": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": {"bits": 4, "version": "gemm"}, + }, + "4bit-awq-gemv": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": {"bits": 4, "version": "gemv"}, + }, + "4bit-awq-exllama-v1": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": { + "bits": 4, + "version": "exllama", + "exllama_config": {"version": 1, "max_input_len": 64, "max_batch_size": 1}, + }, + }, + "4bit-awq-exllama-v2": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": { + "bits": 4, + "version": "exllama", + "exllama_config": {"version": 2, "max_input_len": 64, "max_batch_size": 1}, + }, + }, + } + + +LOGGER = getLogger("llm-perf-backend") +LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") +LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") +LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") + + +def benchmark_intel_openvino(model, attn_implementation, weights_config): + benchmark_name = 
f"{weights_config}-{attn_implementation}" + subfolder = f"{benchmark_name}/{model.replace('/', '--')}" + + torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] + quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] + quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] + + if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): + LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") + return + + if is_benchmark_conducted(PUSH_REPO_ID, subfolder): + LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") + return + + launcher_config = ProcessConfig() + scenario_config = InferenceConfig( + memory=True, + energy=True, + latency=True, + duration=10, + iterations=10, + warmup_runs=10, + input_shapes=INPUT_SHAPES, + generate_kwargs=GENERATE_KWARGS, + ) + backend_config = OVConfig( + model=model, + device="cpu", + device_ids="0", + no_weights=True, + library="transformers", + task="text-generation", + quantization_config=quant_config, + model_kwargs={"trust_remote_code": True}, + ) + + benchmark_config = BenchmarkConfig( + name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config + ) + + benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + try: + LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") + benchmark_report = Benchmark.launch(benchmark_config) + benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + except Exception: + LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") + benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) + benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + +if __name__ == "__main__": + # for isolated process + os.environ["LOG_TO_FILE"] = "0" + os.environ["LOG_LEVEL"] = "INFO" + + # for main process + setup_logging(level="INFO", prefix="MAIN-PROCESS") + + models_attentions_weights = list( + product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) + ) + + LOGGER.info( + f"Running a total of {len(models_attentions_weights)} benchmarks, " + f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " + f"{len(ATTENTION_CONFIGS)} attentions implementations " + f"and {len(WEIGHTS_CONFIGS)} weights configurations." 
+ ) + + for model, attn_implementation, weights_config in models_attentions_weights: + benchmark_intel_openvino(model, attn_implementation, weights_config) diff --git a/llm_perf/hardware/intel/update_llm_perf_intel_ort.py b/llm_perf/hardware/intel/update_llm_perf_intel_ort.py new file mode 100644 index 000000000..e7bb254c8 --- /dev/null +++ b/llm_perf/hardware/intel/update_llm_perf_intel_ort.py @@ -0,0 +1,182 @@ +import os +import traceback +from itertools import product +from logging import getLogger + +from llm_perf.utils import ( + CANONICAL_PRETRAINED_OPEN_LLM_LIST, + GENERATE_KWARGS, + INPUT_SHAPES, + OPEN_LLM_LIST, + PRETRAINED_OPEN_LLM_LIST, + is_benchmark_conducted, + is_benchmark_supported, +) +from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport, InferenceConfig, ProcessConfig, ORTConfig +from optimum_benchmark.logging_utils import setup_logging + +SUBSET = os.getenv("SUBSET", None) +MACHINE = os.getenv("MACHINE", None) +HARDWARE = "intel" + + +if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: + PUSH_REPO_ID = "optimum-benchmark/llm-perf-ort-intel-debug" + CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] + SUBSET = "unquantized" +elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: + PUSH_REPO_ID = f"optimum-benchmark/llm-perf-ort-intel-{SUBSET}-{MACHINE}" +else: + raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") + +ATTENTION_CONFIGS = ["eager", "sdpa"] +if SUBSET == "unquantized": + WEIGHTS_CONFIGS = { + # unquantized + "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, + "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, + "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, + } +elif SUBSET == "bnb": + WEIGHTS_CONFIGS = { + # bnb + "4bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_4bit": True}}, + "8bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_8bit": True}}, + } +elif SUBSET == "gptq": + WEIGHTS_CONFIGS = { + # gptq + "4bit-gptq-exllama-v1": { + "quant_scheme": "gptq", + "torch_dtype": "float16", + "quant_config": {"bits": 4, "use_exllama ": True, "version": 1, "model_seqlen": 256}, + }, + "4bit-gptq-exllama-v2": { + "torch_dtype": "float16", + "quant_scheme": "gptq", + "quant_config": {"bits": 4, "use_exllama ": True, "version": 2, "model_seqlen": 256}, + }, + } +elif SUBSET == "awq": + WEIGHTS_CONFIGS = { + # awq + "4bit-awq-gemm": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": {"bits": 4, "version": "gemm"}, + }, + "4bit-awq-gemv": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": {"bits": 4, "version": "gemv"}, + }, + "4bit-awq-exllama-v1": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": { + "bits": 4, + "version": "exllama", + "exllama_config": {"version": 1, "max_input_len": 64, "max_batch_size": 1}, + }, + }, + "4bit-awq-exllama-v2": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": { + "bits": 4, + "version": "exllama", + "exllama_config": {"version": 2, "max_input_len": 64, "max_batch_size": 1}, + }, + }, + } + + +LOGGER = getLogger("llm-perf-backend") +LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") +LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") +LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): 
{len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") + + +def benchmark_intel_ort(model, attn_implementation, weights_config): + benchmark_name = f"{weights_config}-{attn_implementation}" + subfolder = f"{benchmark_name}/{model.replace('/', '--')}" + + torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] + quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] + quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] + + if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): + LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") + return + + if is_benchmark_conducted(PUSH_REPO_ID, subfolder): + LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") + return + + launcher_config = ProcessConfig() + scenario_config = InferenceConfig( + memory=True, + energy=True, + latency=True, + duration=10, + iterations=10, + warmup_runs=10, + input_shapes=INPUT_SHAPES, + generate_kwargs=GENERATE_KWARGS, + ) + backend_config = ORTConfig( + model=model, + device="cpu", + device_ids="0", + no_weights=True, + library="transformers", + task="text-generation", + torch_dtype=torch_dtype, + # quantization_scheme=quant_scheme, + quantization_config=quant_config, + # attn_implementation=attn_implementation, + model_kwargs={"trust_remote_code": True}, + ) + + benchmark_config = BenchmarkConfig( + name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config + ) + + benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + # try: + LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") + benchmark_report = Benchmark.launch(benchmark_config) + benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + # except Exception: + # LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") + # benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) + # benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + # benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + # benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + +if __name__ == "__main__": + # for isolated process + os.environ["LOG_TO_FILE"] = "0" + os.environ["LOG_LEVEL"] = "INFO" + + # for main process + setup_logging(level="INFO", prefix="MAIN-PROCESS") + + models_attentions_weights = list( + product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) + ) + + LOGGER.info( + f"Running a total of {len(models_attentions_weights)} benchmarks, " + f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " + f"{len(ATTENTION_CONFIGS)} attentions implementations " + f"and {len(WEIGHTS_CONFIGS)} weights configurations." 
+ ) + + for model, attn_implementation, weights_config in models_attentions_weights: + benchmark_intel_ort(model, attn_implementation, weights_config) diff --git a/llm_perf/hardware/intel/update_llm_perf_intel_pytorch.py b/llm_perf/hardware/intel/update_llm_perf_intel_pytorch.py new file mode 100644 index 000000000..bceb89d58 --- /dev/null +++ b/llm_perf/hardware/intel/update_llm_perf_intel_pytorch.py @@ -0,0 +1,182 @@ +import os +import traceback +from itertools import product +from logging import getLogger + +from llm_perf.utils import ( + CANONICAL_PRETRAINED_OPEN_LLM_LIST, + GENERATE_KWARGS, + INPUT_SHAPES, + OPEN_LLM_LIST, + PRETRAINED_OPEN_LLM_LIST, + is_benchmark_conducted, + is_benchmark_supported, +) +from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport, InferenceConfig, ProcessConfig, PyTorchConfig +from optimum_benchmark.logging_utils import setup_logging + +SUBSET = os.getenv("SUBSET", None) +MACHINE = os.getenv("MACHINE", None) +HARDWARE = "intel" + + +if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: + PUSH_REPO_ID = "optimum-benchmark/llm-perf-pytorch-intel-debug" + CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] + SUBSET = "unquantized" +elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: + PUSH_REPO_ID = f"optimum-benchmark/llm-perf-pytorch-intel-{SUBSET}-{MACHINE}" +else: + raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") + +ATTENTION_CONFIGS = ["eager", "sdpa"] +if SUBSET == "unquantized": + WEIGHTS_CONFIGS = { + # unquantized + "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, + "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, + "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, + } +elif SUBSET == "bnb": + WEIGHTS_CONFIGS = { + # bnb + "4bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_4bit": True}}, + "8bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_8bit": True}}, + } +elif SUBSET == "gptq": + WEIGHTS_CONFIGS = { + # gptq + "4bit-gptq-exllama-v1": { + "quant_scheme": "gptq", + "torch_dtype": "float16", + "quant_config": {"bits": 4, "use_exllama ": True, "version": 1, "model_seqlen": 256}, + }, + "4bit-gptq-exllama-v2": { + "torch_dtype": "float16", + "quant_scheme": "gptq", + "quant_config": {"bits": 4, "use_exllama ": True, "version": 2, "model_seqlen": 256}, + }, + } +elif SUBSET == "awq": + WEIGHTS_CONFIGS = { + # awq + "4bit-awq-gemm": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": {"bits": 4, "version": "gemm"}, + }, + "4bit-awq-gemv": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": {"bits": 4, "version": "gemv"}, + }, + "4bit-awq-exllama-v1": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": { + "bits": 4, + "version": "exllama", + "exllama_config": {"version": 1, "max_input_len": 64, "max_batch_size": 1}, + }, + }, + "4bit-awq-exllama-v2": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": { + "bits": 4, + "version": "exllama", + "exllama_config": {"version": 2, "max_input_len": 64, "max_batch_size": 1}, + }, + }, + } + + +LOGGER = getLogger("llm-perf-backend") +LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") +LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") +LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): 
{len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") + + +def benchmark_intel_pytorch(model, attn_implementation, weights_config): + benchmark_name = f"{weights_config}-{attn_implementation}" + subfolder = f"{benchmark_name}/{model.replace('/', '--')}" + + torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] + quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] + quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] + + if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): + LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") + return + + if is_benchmark_conducted(PUSH_REPO_ID, subfolder): + LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") + return + + launcher_config = ProcessConfig() + scenario_config = InferenceConfig( + memory=True, + energy=True, + latency=True, + duration=10, + iterations=10, + warmup_runs=10, + input_shapes=INPUT_SHAPES, + generate_kwargs=GENERATE_KWARGS, + ) + backend_config = PyTorchConfig( + model=model, + device="cpu", + device_ids="0", + no_weights=True, + library="transformers", + task="text-generation", + torch_dtype=torch_dtype, + quantization_scheme=quant_scheme, + quantization_config=quant_config, + attn_implementation=attn_implementation, + model_kwargs={"trust_remote_code": True}, + ) + + benchmark_config = BenchmarkConfig( + name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config + ) + + benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + try: + LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") + benchmark_report = Benchmark.launch(benchmark_config) + benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + except Exception: + LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") + benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) + benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + +if __name__ == "__main__": + # for isolated process + os.environ["LOG_TO_FILE"] = "0" + os.environ["LOG_LEVEL"] = "INFO" + + # for main process + setup_logging(level="INFO", prefix="MAIN-PROCESS") + + models_attentions_weights = list( + product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) + ) + + LOGGER.info( + f"Running a total of {len(models_attentions_weights)} benchmarks, " + f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " + f"{len(ATTENTION_CONFIGS)} attentions implementations " + f"and {len(WEIGHTS_CONFIGS)} weights configurations." 
+ ) + + for model, attn_implementation, weights_config in models_attentions_weights: + benchmark_intel_pytorch(model, attn_implementation, weights_config) diff --git a/llm_perf/utils.py b/llm_perf/utils.py index 6665536c7..ea25cdab3 100644 --- a/llm_perf/utils.py +++ b/llm_perf/utils.py @@ -137,8 +137,12 @@ def is_benchmark_conducted(push_repo_id, subfolder): return False -def is_benchmark_supported(weights_config, attn_implementation): - if attn_implementation == "flash_attention_2" and weights_config == "float32": - return False +def is_benchmark_supported(weights_config, attn_implementation, hardware): + if hardware == "cuda": + if attn_implementation == "flash_attention_2" and weights_config == "float32": + return False + elif hardware == "intel": + if attn_implementation == "flash_attention_2": + return False return True From 01680638d9acab85c96eb879e63b76b6eb658b63 Mon Sep 17 00:00:00 2001 From: baptiste Date: Wed, 21 Aug 2024 14:19:58 +0000 Subject: [PATCH 02/73] add intel pytorch ort and openvino to leaderboard --- .github/workflows/update_llm_perf_cuda_pytorch.yaml | 2 +- llm_perf/hardware/intel/update_llm_perf_intel_openvino.py | 4 +--- llm_perf/hardware/intel/update_llm_perf_intel_ort.py | 4 +--- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/.github/workflows/update_llm_perf_cuda_pytorch.yaml b/.github/workflows/update_llm_perf_cuda_pytorch.yaml index 0ab646ab9..567128e5c 100644 --- a/.github/workflows/update_llm_perf_cuda_pytorch.yaml +++ b/.github/workflows/update_llm_perf_cuda_pytorch.yaml @@ -53,4 +53,4 @@ jobs: pip install packaging && pip install flash-attn einops scipy auto-gptq optimum bitsandbytes autoawq codecarbon pip install -U transformers huggingface_hub[hf_transfer] pip install -e . - python llm_perf/update_llm_perf_cuda_pytorch.py + python llm_perf/hardware/cuda/update_llm_perf_cuda_pytorch.py diff --git a/llm_perf/hardware/intel/update_llm_perf_intel_openvino.py b/llm_perf/hardware/intel/update_llm_perf_intel_openvino.py index 1aa6052a9..869598285 100644 --- a/llm_perf/hardware/intel/update_llm_perf_intel_openvino.py +++ b/llm_perf/hardware/intel/update_llm_perf_intel_openvino.py @@ -12,7 +12,7 @@ is_benchmark_conducted, is_benchmark_supported, ) -from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport, InferenceConfig, ProcessConfig, OVConfig +from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport, InferenceConfig, OVConfig, ProcessConfig from optimum_benchmark.logging_utils import setup_logging SUBSET = os.getenv("SUBSET", None) @@ -101,8 +101,6 @@ def benchmark_intel_openvino(model, attn_implementation, weights_config): benchmark_name = f"{weights_config}-{attn_implementation}" subfolder = f"{benchmark_name}/{model.replace('/', '--')}" - torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] - quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): diff --git a/llm_perf/hardware/intel/update_llm_perf_intel_ort.py b/llm_perf/hardware/intel/update_llm_perf_intel_ort.py index e7bb254c8..5b8f5c3eb 100644 --- a/llm_perf/hardware/intel/update_llm_perf_intel_ort.py +++ b/llm_perf/hardware/intel/update_llm_perf_intel_ort.py @@ -1,5 +1,4 @@ import os -import traceback from itertools import product from logging import getLogger @@ -12,7 +11,7 @@ is_benchmark_conducted, is_benchmark_supported, ) -from optimum_benchmark import Benchmark, BenchmarkConfig, 
BenchmarkReport, InferenceConfig, ProcessConfig, ORTConfig +from optimum_benchmark import Benchmark, BenchmarkConfig, InferenceConfig, ORTConfig, ProcessConfig from optimum_benchmark.logging_utils import setup_logging SUBSET = os.getenv("SUBSET", None) @@ -102,7 +101,6 @@ def benchmark_intel_ort(model, attn_implementation, weights_config): subfolder = f"{benchmark_name}/{model.replace('/', '--')}" torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] - quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): From 0bc416f1bfaef797f2f6ad1104e520daf6cf96c1 Mon Sep 17 00:00:00 2001 From: Baptiste Colle <32412211+baptistecolle@users.noreply.github.com> Date: Wed, 21 Aug 2024 16:22:27 +0200 Subject: [PATCH 03/73] Add support for intel in leaderboard --- .../update_llm_perf_intel_pytorch.yml | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 .github/workflows/update_llm_perf_intel_pytorch.yml diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml b/.github/workflows/update_llm_perf_intel_pytorch.yml new file mode 100644 index 000000000..b7254edb9 --- /dev/null +++ b/.github/workflows/update_llm_perf_intel_pytorch.yml @@ -0,0 +1,55 @@ +name: Update LLM Perf Benchmarks - CUDA PyTorch + +on: + workflow_dispatch: + schedule: + - cron: "0 0 * * *" + +concurrency: + cancel-in-progress: true + group: ${{ github.workflow }}-${{ github.ref }} + +env: + IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cuda + +jobs: + run_benchmarks: + strategy: + fail-fast: false + matrix: + subset: [unquantized, bnb, awq, gptq] + + machine: [ + {name: c7i, runs-on: {group: 'aws-c7i-8xlarge-plus'}}, + ] + + runs-on: ${{ matrix.machine.runs-on }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Run benchmarks + uses: addnab/docker-run-action@v3 + env: + SUBSET: ${{ matrix.subset }} + MACHINE: ${{ matrix.machine.name }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} + with: + image: ${{ env.IMAGE }} + options: | + --rm + --gpus all + --shm-size 64G + --env SUBSET + --env MACHINE + --env HF_TOKEN + --env MKL_THREADING_LAYER=GNU + --env HF_HUB_ENABLE_HF_TRANSFER=1 + --volume ${{ github.workspace }}:/workspace + --workdir /workspace + run: | + pip install packaging && pip install flash-attn einops scipy auto-gptq optimum bitsandbytes autoawq codecarbon + pip install -U transformers huggingface_hub[hf_transfer] + pip install -e . 
+ python llm_perf/hardware/intel/update_llm_perf_cuda_pytorch.py From 85f62e653b362dcec0cb41140f2fa4b34f8acb2c Mon Sep 17 00:00:00 2001 From: Baptiste Colle <32412211+baptistecolle@users.noreply.github.com> Date: Wed, 21 Aug 2024 16:29:12 +0200 Subject: [PATCH 04/73] Update update_llm_perf_intel_pytorch.yml --- .github/workflows/update_llm_perf_intel_pytorch.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml b/.github/workflows/update_llm_perf_intel_pytorch.yml index b7254edb9..4d8413c49 100644 --- a/.github/workflows/update_llm_perf_intel_pytorch.yml +++ b/.github/workflows/update_llm_perf_intel_pytorch.yml @@ -1,6 +1,8 @@ name: Update LLM Perf Benchmarks - CUDA PyTorch on: + pull_request: + push: workflow_dispatch: schedule: - cron: "0 0 * * *" From 7151e01b798314164c5902046e38abdfd07ba3eb Mon Sep 17 00:00:00 2001 From: Baptiste Colle <32412211+baptistecolle@users.noreply.github.com> Date: Wed, 21 Aug 2024 16:40:12 +0200 Subject: [PATCH 05/73] Update update_llm_perf_intel_pytorch.yml --- .github/workflows/update_llm_perf_intel_pytorch.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml b/.github/workflows/update_llm_perf_intel_pytorch.yml index 4d8413c49..b7254edb9 100644 --- a/.github/workflows/update_llm_perf_intel_pytorch.yml +++ b/.github/workflows/update_llm_perf_intel_pytorch.yml @@ -1,8 +1,6 @@ name: Update LLM Perf Benchmarks - CUDA PyTorch on: - pull_request: - push: workflow_dispatch: schedule: - cron: "0 0 * * *" From c92f818f7c26a1bddae78adfc92e3827a85b2eb3 Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 22 Aug 2024 07:05:49 +0000 Subject: [PATCH 06/73] add new llm_perf_tests --- .../update_llm_perf_intel_pytorch.yaml | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 .github/workflows/update_llm_perf_intel_pytorch.yaml diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yaml b/.github/workflows/update_llm_perf_intel_pytorch.yaml new file mode 100644 index 000000000..2eb8b906c --- /dev/null +++ b/.github/workflows/update_llm_perf_intel_pytorch.yaml @@ -0,0 +1,55 @@ +name: Update LLM Perf Benchmarks - Intel PyTorch + +on: + workflow_dispatch: + schedule: + - cron: "0 0 * * *" + +concurrency: + cancel-in-progress: true + group: ${{ github.workflow }}-${{ github.ref }} + +env: + IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cuda + +jobs: + run_benchmarks: + strategy: + fail-fast: false + matrix: + subset: [unquantized, bnb, awq, gptq] + + machine: [ + {name: c7i, runs-on: {group: 'aws-c7i-8xlarge-plus'}}, + ] + + runs-on: ${{ matrix.machine.runs-on }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Run benchmarks + uses: addnab/docker-run-action@v3 + env: + SUBSET: ${{ matrix.subset }} + MACHINE: ${{ matrix.machine.name }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} + with: + image: ${{ env.IMAGE }} + options: | + --rm + --gpus all + --shm-size 64G + --env SUBSET + --env MACHINE + --env HF_TOKEN + --env MKL_THREADING_LAYER=GNU + --env HF_HUB_ENABLE_HF_TRANSFER=1 + --volume ${{ github.workspace }}:/workspace + --workdir /workspace + run: | + pip install packaging && pip install flash-attn einops scipy auto-gptq optimum bitsandbytes autoawq codecarbon + pip install -U transformers huggingface_hub[hf_transfer] + pip install -e . 
+ python llm_perf/hardware/intel/update_llm_perf_cuda_pytorch.py From c31e6cf5bcf9a8f9fc18ac10ee75543cbe0b8720 Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 22 Aug 2024 07:15:18 +0000 Subject: [PATCH 07/73] fix workflow --- .../update_llm_perf_intel_pytorch.yaml | 55 ------------------- .../update_llm_perf_intel_pytorch.yml | 2 +- 2 files changed, 1 insertion(+), 56 deletions(-) delete mode 100644 .github/workflows/update_llm_perf_intel_pytorch.yaml diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yaml b/.github/workflows/update_llm_perf_intel_pytorch.yaml deleted file mode 100644 index 2eb8b906c..000000000 --- a/.github/workflows/update_llm_perf_intel_pytorch.yaml +++ /dev/null @@ -1,55 +0,0 @@ -name: Update LLM Perf Benchmarks - Intel PyTorch - -on: - workflow_dispatch: - schedule: - - cron: "0 0 * * *" - -concurrency: - cancel-in-progress: true - group: ${{ github.workflow }}-${{ github.ref }} - -env: - IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cuda - -jobs: - run_benchmarks: - strategy: - fail-fast: false - matrix: - subset: [unquantized, bnb, awq, gptq] - - machine: [ - {name: c7i, runs-on: {group: 'aws-c7i-8xlarge-plus'}}, - ] - - runs-on: ${{ matrix.machine.runs-on }} - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Run benchmarks - uses: addnab/docker-run-action@v3 - env: - SUBSET: ${{ matrix.subset }} - MACHINE: ${{ matrix.machine.name }} - HF_TOKEN: ${{ secrets.HF_TOKEN }} - with: - image: ${{ env.IMAGE }} - options: | - --rm - --gpus all - --shm-size 64G - --env SUBSET - --env MACHINE - --env HF_TOKEN - --env MKL_THREADING_LAYER=GNU - --env HF_HUB_ENABLE_HF_TRANSFER=1 - --volume ${{ github.workspace }}:/workspace - --workdir /workspace - run: | - pip install packaging && pip install flash-attn einops scipy auto-gptq optimum bitsandbytes autoawq codecarbon - pip install -U transformers huggingface_hub[hf_transfer] - pip install -e . 
- python llm_perf/hardware/intel/update_llm_perf_cuda_pytorch.py diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml b/.github/workflows/update_llm_perf_intel_pytorch.yml index b7254edb9..2eb8b906c 100644 --- a/.github/workflows/update_llm_perf_intel_pytorch.yml +++ b/.github/workflows/update_llm_perf_intel_pytorch.yml @@ -1,4 +1,4 @@ -name: Update LLM Perf Benchmarks - CUDA PyTorch +name: Update LLM Perf Benchmarks - Intel PyTorch on: workflow_dispatch: From d4064401899fe76785197b60cd96f5fb5562bb18 Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 22 Aug 2024 10:51:21 +0000 Subject: [PATCH 08/73] fix failing tests --- .github/workflows/update_llm_perf_intel_pytorch.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml b/.github/workflows/update_llm_perf_intel_pytorch.yml index 2eb8b906c..9d3fd26ec 100644 --- a/.github/workflows/update_llm_perf_intel_pytorch.yml +++ b/.github/workflows/update_llm_perf_intel_pytorch.yml @@ -10,7 +10,7 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} env: - IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cuda + IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cpu jobs: run_benchmarks: From 20b96b2f6c7d5145e601590e5444cd2519a221b9 Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 22 Aug 2024 11:05:20 +0000 Subject: [PATCH 09/73] fix failing tests --- .github/workflows/update_llm_perf_intel_pytorch.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml b/.github/workflows/update_llm_perf_intel_pytorch.yml index 9d3fd26ec..4de8077db 100644 --- a/.github/workflows/update_llm_perf_intel_pytorch.yml +++ b/.github/workflows/update_llm_perf_intel_pytorch.yml @@ -39,7 +39,6 @@ jobs: image: ${{ env.IMAGE }} options: | --rm - --gpus all --shm-size 64G --env SUBSET --env MACHINE From c7e0ec0d7a413435661a8e501a5d2f5a8b39bb84 Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 22 Aug 2024 11:17:46 +0000 Subject: [PATCH 10/73] fix failing tests --- .github/workflows/update_llm_perf_intel_pytorch.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml b/.github/workflows/update_llm_perf_intel_pytorch.yml index 4de8077db..6ce8aafbf 100644 --- a/.github/workflows/update_llm_perf_intel_pytorch.yml +++ b/.github/workflows/update_llm_perf_intel_pytorch.yml @@ -51,4 +51,4 @@ jobs: pip install packaging && pip install flash-attn einops scipy auto-gptq optimum bitsandbytes autoawq codecarbon pip install -U transformers huggingface_hub[hf_transfer] pip install -e . 
- python llm_perf/hardware/intel/update_llm_perf_cuda_pytorch.py + python llm_perf/hardware/intel/update_llm_perf_intel_pytorch.py From 6d7bf692cf8057067d9184bab5a0b28b8792fa79 Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 22 Aug 2024 11:51:39 +0000 Subject: [PATCH 11/73] fix failing tests --- .github/workflows/update_llm_perf_intel_pytorch.yml | 4 ++-- llm_perf/hardware/intel/update_llm_perf_intel_pytorch.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml b/.github/workflows/update_llm_perf_intel_pytorch.yml index 6ce8aafbf..b285544aa 100644 --- a/.github/workflows/update_llm_perf_intel_pytorch.yml +++ b/.github/workflows/update_llm_perf_intel_pytorch.yml @@ -17,7 +17,7 @@ jobs: strategy: fail-fast: false matrix: - subset: [unquantized, bnb, awq, gptq] + subset: [unquantized] machine: [ {name: c7i, runs-on: {group: 'aws-c7i-8xlarge-plus'}}, @@ -48,7 +48,7 @@ jobs: --volume ${{ github.workspace }}:/workspace --workdir /workspace run: | - pip install packaging && pip install flash-attn einops scipy auto-gptq optimum bitsandbytes autoawq codecarbon + pip install packaging && pip install einops scipy optimum codecarbon pip install -U transformers huggingface_hub[hf_transfer] pip install -e . python llm_perf/hardware/intel/update_llm_perf_intel_pytorch.py diff --git a/llm_perf/hardware/intel/update_llm_perf_intel_pytorch.py b/llm_perf/hardware/intel/update_llm_perf_intel_pytorch.py index bceb89d58..797cfb2f6 100644 --- a/llm_perf/hardware/intel/update_llm_perf_intel_pytorch.py +++ b/llm_perf/hardware/intel/update_llm_perf_intel_pytorch.py @@ -127,7 +127,6 @@ def benchmark_intel_pytorch(model, attn_implementation, weights_config): backend_config = PyTorchConfig( model=model, device="cpu", - device_ids="0", no_weights=True, library="transformers", task="text-generation", From 7048df5f649a97078fd806077cb7dfb2054b2d86 Mon Sep 17 00:00:00 2001 From: baptiste Date: Mon, 2 Sep 2024 12:24:05 +0000 Subject: [PATCH 12/73] refractoring --- .../hardware/intel/update_llm_perf_intel.py | 211 ++++++++++++++++++ 1 file changed, 211 insertions(+) create mode 100644 llm_perf/hardware/intel/update_llm_perf_intel.py diff --git a/llm_perf/hardware/intel/update_llm_perf_intel.py b/llm_perf/hardware/intel/update_llm_perf_intel.py new file mode 100644 index 000000000..4ec3a09f5 --- /dev/null +++ b/llm_perf/hardware/intel/update_llm_perf_intel.py @@ -0,0 +1,211 @@ +import os +import traceback +from itertools import product +from logging import getLogger + +from llm_perf.utils import ( + CANONICAL_PRETRAINED_OPEN_LLM_LIST, + GENERATE_KWARGS, + INPUT_SHAPES, + OPEN_LLM_LIST, + PRETRAINED_OPEN_LLM_LIST, + is_benchmark_conducted, + is_benchmark_supported, +) +from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport, InferenceConfig, ProcessConfig, PyTorchConfig, ORTConfig, OVConfig +from optimum_benchmark.logging_utils import setup_logging + +SUBSET = os.getenv("SUBSET", None) +MACHINE = os.getenv("MACHINE", None) +HARDWARE = "intel" + + +if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: + PUSH_REPO_ID = "optimum-benchmark/llm-perf-pytorch-intel-debug" + CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] + SUBSET = "unquantized" +elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: + PUSH_REPO_ID = f"optimum-benchmark/llm-perf-pytorch-intel-{SUBSET}-{MACHINE}" +else: + raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") 
+ +ATTENTION_CONFIGS = ["eager", "sdpa"] +BACKENDS = ["pytorch", "onnxruntime", "openvino"] + + +if SUBSET == "unquantized": + WEIGHTS_CONFIGS = { + # unquantized + "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, + "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, + "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, + } +elif SUBSET == "bnb": + WEIGHTS_CONFIGS = { + # bnb + "4bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_4bit": True}}, + "8bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_8bit": True}}, + } +elif SUBSET == "gptq": + WEIGHTS_CONFIGS = { + # gptq + "4bit-gptq-exllama-v1": { + "quant_scheme": "gptq", + "torch_dtype": "float16", + "quant_config": {"bits": 4, "use_exllama ": True, "version": 1, "model_seqlen": 256}, + }, + "4bit-gptq-exllama-v2": { + "torch_dtype": "float16", + "quant_scheme": "gptq", + "quant_config": {"bits": 4, "use_exllama ": True, "version": 2, "model_seqlen": 256}, + }, + } +elif SUBSET == "awq": + WEIGHTS_CONFIGS = { + # awq + "4bit-awq-gemm": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": {"bits": 4, "version": "gemm"}, + }, + "4bit-awq-gemv": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": {"bits": 4, "version": "gemv"}, + }, + "4bit-awq-exllama-v1": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": { + "bits": 4, + "version": "exllama", + "exllama_config": {"version": 1, "max_input_len": 64, "max_batch_size": 1}, + }, + }, + "4bit-awq-exllama-v2": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": { + "bits": 4, + "version": "exllama", + "exllama_config": {"version": 2, "max_input_len": 64, "max_batch_size": 1}, + }, + }, + } + + +LOGGER = getLogger("llm-perf-backend") +LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") +LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") +LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") + + +def benchmark_intel(model, attn_implementation, weights_config, backend): + benchmark_name = f"{weights_config}-{attn_implementation}-{backend}" + subfolder = f"{benchmark_name}/{model.replace('/', '--')}" + + torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] + quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] + quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] + + if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): + LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") + return + + if is_benchmark_conducted(PUSH_REPO_ID, subfolder): + LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") + return + + launcher_config = ProcessConfig() + scenario_config = InferenceConfig( + memory=True, + energy=True, + latency=True, + duration=10, + iterations=10, + warmup_runs=10, + input_shapes=INPUT_SHAPES, + generate_kwargs=GENERATE_KWARGS, + ) + + if backend == "pytorch": + backend_config = PyTorchConfig( + model=model, + device="cpu", + no_weights=True, + library="transformers", + task="text-generation", + torch_dtype=torch_dtype, + quantization_scheme=quant_scheme, + quantization_config=quant_config, + attn_implementation=attn_implementation, + model_kwargs={"trust_remote_code": True}, + ) + elif backend == "onnxruntime": + backend_config = 
ORTConfig( + model=model, + device="cpu", + device_ids="0", + no_weights=True, + library="transformers", + task="text-generation", + torch_dtype=torch_dtype, + quantization_config=quant_config, + model_kwargs={"trust_remote_code": True}, + ) + elif backend == "openvino": + backend_config = OVConfig( + model=model, + device="cpu", + device_ids="0", + no_weights=True, + library="transformers", + task="text-generation", + quantization_config=quant_config, + model_kwargs={"trust_remote_code": True}, + ) + else: + raise ValueError(f"Unsupported backend: {backend}") + + benchmark_config = BenchmarkConfig( + name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config + ) + + benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + try: + LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") + benchmark_report = Benchmark.launch(benchmark_config) + benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + except Exception: + LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") + benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) + benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + +if __name__ == "__main__": + # for isolated process + os.environ["LOG_TO_FILE"] = "0" + os.environ["LOG_LEVEL"] = "INFO" + + # for main process + setup_logging(level="INFO", prefix="MAIN-PROCESS") + + models_attentions_weights = list( + product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys(), BACKENDS) + ) + + LOGGER.info( + f"Running a total of {len(models_attentions_weights)} benchmarks, " + f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " + f"{len(ATTENTION_CONFIGS)} attentions implementations " + f"and {len(WEIGHTS_CONFIGS)} weights configurations." + ) + + for model, attn_implementation, weights_config, backend in models_attentions_weights: + benchmark_intel(model, attn_implementation, weights_config, backend) From db88b2af8f4be21bdff36b6c1283590ccf6ec804 Mon Sep 17 00:00:00 2001 From: baptiste Date: Mon, 2 Sep 2024 12:30:16 +0000 Subject: [PATCH 13/73] intel with multiple backends --- .github/workflows/update_llm_perf_intel_pytorch.yml | 2 +- llm_perf/hardware/intel/update_llm_perf_intel.py | 11 ++++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml b/.github/workflows/update_llm_perf_intel_pytorch.yml index b285544aa..d43277f66 100644 --- a/.github/workflows/update_llm_perf_intel_pytorch.yml +++ b/.github/workflows/update_llm_perf_intel_pytorch.yml @@ -51,4 +51,4 @@ jobs: pip install packaging && pip install einops scipy optimum codecarbon pip install -U transformers huggingface_hub[hf_transfer] pip install -e . 
- python llm_perf/hardware/intel/update_llm_perf_intel_pytorch.py + python llm_perf/hardware/intel/update_llm_perf_intel.py diff --git a/llm_perf/hardware/intel/update_llm_perf_intel.py b/llm_perf/hardware/intel/update_llm_perf_intel.py index 4ec3a09f5..325ff9138 100644 --- a/llm_perf/hardware/intel/update_llm_perf_intel.py +++ b/llm_perf/hardware/intel/update_llm_perf_intel.py @@ -12,7 +12,16 @@ is_benchmark_conducted, is_benchmark_supported, ) -from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport, InferenceConfig, ProcessConfig, PyTorchConfig, ORTConfig, OVConfig +from optimum_benchmark import ( + Benchmark, + BenchmarkConfig, + BenchmarkReport, + InferenceConfig, + ORTConfig, + OVConfig, + ProcessConfig, + PyTorchConfig, +) from optimum_benchmark.logging_utils import setup_logging SUBSET = os.getenv("SUBSET", None) From 1246d28ef590c87cd0ce4e8c5c3a99f157eee6e6 Mon Sep 17 00:00:00 2001 From: baptiste Date: Mon, 2 Sep 2024 18:31:32 +0000 Subject: [PATCH 14/73] parallelize intel llm-perf --- .../update_llm_perf_intel_pytorch.yml | 2 +- .../hardware/intel/update_llm_perf_intel.py | 20 +++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml b/.github/workflows/update_llm_perf_intel_pytorch.yml index d43277f66..d66f23b7f 100644 --- a/.github/workflows/update_llm_perf_intel_pytorch.yml +++ b/.github/workflows/update_llm_perf_intel_pytorch.yml @@ -18,7 +18,7 @@ jobs: fail-fast: false matrix: subset: [unquantized] - + backend: [pytorch, onnxruntime, openvino] machine: [ {name: c7i, runs-on: {group: 'aws-c7i-8xlarge-plus'}}, ] diff --git a/llm_perf/hardware/intel/update_llm_perf_intel.py b/llm_perf/hardware/intel/update_llm_perf_intel.py index 325ff9138..58f0a3666 100644 --- a/llm_perf/hardware/intel/update_llm_perf_intel.py +++ b/llm_perf/hardware/intel/update_llm_perf_intel.py @@ -27,6 +27,7 @@ SUBSET = os.getenv("SUBSET", None) MACHINE = os.getenv("MACHINE", None) HARDWARE = "intel" +BACKEND = os.getenv("BACKEND", None) if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: @@ -39,7 +40,6 @@ raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") ATTENTION_CONFIGS = ["eager", "sdpa"] -BACKENDS = ["pytorch", "onnxruntime", "openvino"] if SUBSET == "unquantized": @@ -109,8 +109,8 @@ LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") -def benchmark_intel(model, attn_implementation, weights_config, backend): - benchmark_name = f"{weights_config}-{attn_implementation}-{backend}" +def benchmark_intel(model, attn_implementation, weights_config): + benchmark_name = f"{weights_config}-{attn_implementation}-{BACKEND}" subfolder = f"{benchmark_name}/{model.replace('/', '--')}" torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] @@ -137,7 +137,7 @@ def benchmark_intel(model, attn_implementation, weights_config, backend): generate_kwargs=GENERATE_KWARGS, ) - if backend == "pytorch": + if BACKEND == "pytorch": backend_config = PyTorchConfig( model=model, device="cpu", @@ -150,7 +150,7 @@ def benchmark_intel(model, attn_implementation, weights_config, backend): attn_implementation=attn_implementation, model_kwargs={"trust_remote_code": True}, ) - elif backend == "onnxruntime": + elif BACKEND == "onnxruntime": backend_config = ORTConfig( model=model, device="cpu", @@ -162,7 +162,7 @@ def benchmark_intel(model, attn_implementation, weights_config, backend): 
quantization_config=quant_config, model_kwargs={"trust_remote_code": True}, ) - elif backend == "openvino": + elif BACKEND == "openvino": backend_config = OVConfig( model=model, device="cpu", @@ -174,7 +174,7 @@ def benchmark_intel(model, attn_implementation, weights_config, backend): model_kwargs={"trust_remote_code": True}, ) else: - raise ValueError(f"Unsupported backend: {backend}") + raise ValueError(f"Unsupported backend: {BACKEND}") benchmark_config = BenchmarkConfig( name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config @@ -206,7 +206,7 @@ def benchmark_intel(model, attn_implementation, weights_config, backend): setup_logging(level="INFO", prefix="MAIN-PROCESS") models_attentions_weights = list( - product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys(), BACKENDS) + product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) ) LOGGER.info( @@ -216,5 +216,5 @@ def benchmark_intel(model, attn_implementation, weights_config, backend): f"and {len(WEIGHTS_CONFIGS)} weights configurations." ) - for model, attn_implementation, weights_config, backend in models_attentions_weights: - benchmark_intel(model, attn_implementation, weights_config, backend) + for model, attn_implementation, weights_config in models_attentions_weights: + benchmark_intel(model, attn_implementation, weights_config) From 2d6830e12eca7d7f7736ee8a685fff9435613933 Mon Sep 17 00:00:00 2001 From: baptiste Date: Mon, 2 Sep 2024 18:36:21 +0000 Subject: [PATCH 15/73] parallelize intel llm-perf --- .github/workflows/update_llm_perf_intel_pytorch.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml b/.github/workflows/update_llm_perf_intel_pytorch.yml index d66f23b7f..9719ed3ec 100644 --- a/.github/workflows/update_llm_perf_intel_pytorch.yml +++ b/.github/workflows/update_llm_perf_intel_pytorch.yml @@ -33,6 +33,7 @@ jobs: uses: addnab/docker-run-action@v3 env: SUBSET: ${{ matrix.subset }} + BACKEND: ${{ matrix.backend }} MACHINE: ${{ matrix.machine.name }} HF_TOKEN: ${{ secrets.HF_TOKEN }} with: From 801c5bfef8793bb0597673addc7ffaff711126d6 Mon Sep 17 00:00:00 2001 From: baptiste Date: Mon, 2 Sep 2024 18:44:07 +0000 Subject: [PATCH 16/73] parallelize intel llm-perf --- .github/workflows/update_llm_perf_intel_pytorch.yml | 4 ++-- llm_perf/hardware/intel/update_llm_perf_intel.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml b/.github/workflows/update_llm_perf_intel_pytorch.yml index 9719ed3ec..31c75e874 100644 --- a/.github/workflows/update_llm_perf_intel_pytorch.yml +++ b/.github/workflows/update_llm_perf_intel_pytorch.yml @@ -17,8 +17,8 @@ jobs: strategy: fail-fast: false matrix: - subset: [unquantized] backend: [pytorch, onnxruntime, openvino] + subset: [unquantized] machine: [ {name: c7i, runs-on: {group: 'aws-c7i-8xlarge-plus'}}, ] @@ -32,8 +32,8 @@ jobs: - name: Run benchmarks uses: addnab/docker-run-action@v3 env: - SUBSET: ${{ matrix.subset }} BACKEND: ${{ matrix.backend }} + SUBSET: ${{ matrix.subset }} MACHINE: ${{ matrix.machine.name }} HF_TOKEN: ${{ secrets.HF_TOKEN }} with: diff --git a/llm_perf/hardware/intel/update_llm_perf_intel.py b/llm_perf/hardware/intel/update_llm_perf_intel.py index 58f0a3666..f91eb5a23 100644 --- a/llm_perf/hardware/intel/update_llm_perf_intel.py +++ b/llm_perf/hardware/intel/update_llm_perf_intel.py @@ -26,9 +26,8 @@ SUBSET = os.getenv("SUBSET", None) MACHINE = 
os.getenv("MACHINE", None) -HARDWARE = "intel" BACKEND = os.getenv("BACKEND", None) - +HARDWARE = "intel" if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: PUSH_REPO_ID = "optimum-benchmark/llm-perf-pytorch-intel-debug" From 2e9526c7da3d933750fcf79d17715c629d0ffaa0 Mon Sep 17 00:00:00 2001 From: baptiste Date: Mon, 2 Sep 2024 18:59:51 +0000 Subject: [PATCH 17/73] parallelize intel llm-perf --- .github/workflows/update_llm_perf_intel_pytorch.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml b/.github/workflows/update_llm_perf_intel_pytorch.yml index 31c75e874..65a9bf73c 100644 --- a/.github/workflows/update_llm_perf_intel_pytorch.yml +++ b/.github/workflows/update_llm_perf_intel_pytorch.yml @@ -41,6 +41,7 @@ jobs: options: | --rm --shm-size 64G + --env BACKEND --env SUBSET --env MACHINE --env HF_TOKEN From 6d87d31a280cf5c86c69ced7e3faeca3973858ca Mon Sep 17 00:00:00 2001 From: baptiste Date: Mon, 2 Sep 2024 20:18:57 +0000 Subject: [PATCH 18/73] parallelize intel llm-perf --- .github/workflows/update_llm_perf_intel_pytorch.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml b/.github/workflows/update_llm_perf_intel_pytorch.yml index 65a9bf73c..5de6a8591 100644 --- a/.github/workflows/update_llm_perf_intel_pytorch.yml +++ b/.github/workflows/update_llm_perf_intel_pytorch.yml @@ -50,7 +50,7 @@ jobs: --volume ${{ github.workspace }}:/workspace --workdir /workspace run: | - pip install packaging && pip install einops scipy optimum codecarbon + pip install packaging && pip install einops scipy optimum codecarbon onnxruntime openvino pip install -U transformers huggingface_hub[hf_transfer] pip install -e . python llm_perf/hardware/intel/update_llm_perf_intel.py From 62266a6d7eb473b1e82051be31393a0975dbd581 Mon Sep 17 00:00:00 2001 From: baptiste Date: Mon, 2 Sep 2024 20:26:10 +0000 Subject: [PATCH 19/73] parallelize intel llm-perf --- .github/workflows/update_llm_perf_intel_pytorch.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml b/.github/workflows/update_llm_perf_intel_pytorch.yml index 5de6a8591..d48f26d61 100644 --- a/.github/workflows/update_llm_perf_intel_pytorch.yml +++ b/.github/workflows/update_llm_perf_intel_pytorch.yml @@ -50,7 +50,7 @@ jobs: --volume ${{ github.workspace }}:/workspace --workdir /workspace run: | - pip install packaging && pip install einops scipy optimum codecarbon onnxruntime openvino + pip install packaging && pip install einops scipy optimum codecarbon pip install -U transformers huggingface_hub[hf_transfer] - pip install -e . 
+ pip install -e .[onnxruntime, openvino] python llm_perf/hardware/intel/update_llm_perf_intel.py From 0a39667ba612e17e93e1a4c76763400183fd7341 Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 3 Sep 2024 05:42:06 +0000 Subject: [PATCH 20/73] parallelize intel llm-perf --- .github/workflows/update_llm_perf_intel_pytorch.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml b/.github/workflows/update_llm_perf_intel_pytorch.yml index d48f26d61..c972a8869 100644 --- a/.github/workflows/update_llm_perf_intel_pytorch.yml +++ b/.github/workflows/update_llm_perf_intel_pytorch.yml @@ -52,5 +52,5 @@ jobs: run: | pip install packaging && pip install einops scipy optimum codecarbon pip install -U transformers huggingface_hub[hf_transfer] - pip install -e .[onnxruntime, openvino] + pip install -e .[onnxruntime,openvino] python llm_perf/hardware/intel/update_llm_perf_intel.py From caf7b67393783ea7ff331f9125a9639604969cb2 Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 3 Sep 2024 06:00:03 +0000 Subject: [PATCH 21/73] parallelize intel llm-perf --- llm_perf/hardware/intel/update_llm_perf_intel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_perf/hardware/intel/update_llm_perf_intel.py b/llm_perf/hardware/intel/update_llm_perf_intel.py index f91eb5a23..8fe64e6d0 100644 --- a/llm_perf/hardware/intel/update_llm_perf_intel.py +++ b/llm_perf/hardware/intel/update_llm_perf_intel.py @@ -34,7 +34,7 @@ CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] SUBSET = "unquantized" elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-pytorch-intel-{SUBSET}-{MACHINE}" + PUSH_REPO_ID = f"optimum-benchmark/llm-perf-pytorch-intel-{SUBSET}-{MACHINE}-{BACKEND}" else: raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") From 5890457b58c7c7614046ed1d32f98ce9345178ab Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 3 Sep 2024 11:28:14 +0000 Subject: [PATCH 22/73] parallelize intel llm-perf --- llm_perf/hardware/intel/update_llm_perf_intel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llm_perf/hardware/intel/update_llm_perf_intel.py b/llm_perf/hardware/intel/update_llm_perf_intel.py index 8fe64e6d0..5c760e1bd 100644 --- a/llm_perf/hardware/intel/update_llm_perf_intel.py +++ b/llm_perf/hardware/intel/update_llm_perf_intel.py @@ -30,11 +30,11 @@ HARDWARE = "intel" if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: - PUSH_REPO_ID = "optimum-benchmark/llm-perf-pytorch-intel-debug" + PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-intel-debug" CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] SUBSET = "unquantized" elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-pytorch-intel-{SUBSET}-{MACHINE}-{BACKEND}" + PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-intel-{SUBSET}-{MACHINE}" else: raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") From 50bd1a2e8acb8f6da6c4870a2c01b5a763f13fe4 Mon Sep 17 00:00:00 2001 From: baptiste Date: Wed, 4 Sep 2024 10:06:06 +0000 Subject: [PATCH 23/73] update leaderboard collection to support more hardware --- llm_perf/hardware.yml | 43 +++++ .../intel/update_llm_perf_intel_openvino.py | 177 ----------------- .../intel/update_llm_perf_intel_ort.py | 180 ----------------- 
.../intel/update_llm_perf_intel_pytorch.py | 181 ------------------ .../update_llm_perf_cuda_pytorch.py | 0 .../{intel => }/update_llm_perf_intel.py | 0 llm_perf/hardware/utils.py | 20 ++ llm_perf/update_llm_perf_leaderboard.py | 32 +++- 8 files changed, 86 insertions(+), 547 deletions(-) create mode 100644 llm_perf/hardware.yml delete mode 100644 llm_perf/hardware/intel/update_llm_perf_intel_openvino.py delete mode 100644 llm_perf/hardware/intel/update_llm_perf_intel_ort.py delete mode 100644 llm_perf/hardware/intel/update_llm_perf_intel_pytorch.py rename llm_perf/hardware/{cuda => }/update_llm_perf_cuda_pytorch.py (100%) rename llm_perf/hardware/{intel => }/update_llm_perf_intel.py (100%) create mode 100644 llm_perf/hardware/utils.py diff --git a/llm_perf/hardware.yml b/llm_perf/hardware.yml new file mode 100644 index 000000000..a69f35e33 --- /dev/null +++ b/llm_perf/hardware.yml @@ -0,0 +1,43 @@ +- machine: 1xA10 + description: A10-24GB-150W 🖥️ + type: cuda + subsets: + - unquantized + - awq + - bnb + - gptq + backends: + - pytorch + +- machine: 1xA100 + description: A100-80GB-275W 🖥️ + type: cuda + subsets: + - unquantized + - awq + - bnb + - gptq + backends: + - pytorch + +- machine: 1xT4 + description: T4-16GB-70W 🖥️ + type: cuda + subsets: + - unquantized + - awq + - bnb + - gptq + backends: + - pytorch + +- machine: c7i + description: 4th-Gen-Intel-Xeon-385W 🖥️ + type: intel + subsets: + - unquantized + - awq + - bnb + - gptq + backends: + - pytorch \ No newline at end of file diff --git a/llm_perf/hardware/intel/update_llm_perf_intel_openvino.py b/llm_perf/hardware/intel/update_llm_perf_intel_openvino.py deleted file mode 100644 index 869598285..000000000 --- a/llm_perf/hardware/intel/update_llm_perf_intel_openvino.py +++ /dev/null @@ -1,177 +0,0 @@ -import os -import traceback -from itertools import product -from logging import getLogger - -from llm_perf.utils import ( - CANONICAL_PRETRAINED_OPEN_LLM_LIST, - GENERATE_KWARGS, - INPUT_SHAPES, - OPEN_LLM_LIST, - PRETRAINED_OPEN_LLM_LIST, - is_benchmark_conducted, - is_benchmark_supported, -) -from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport, InferenceConfig, OVConfig, ProcessConfig -from optimum_benchmark.logging_utils import setup_logging - -SUBSET = os.getenv("SUBSET", None) -MACHINE = os.getenv("MACHINE", None) -HARDWARE = "intel" - - -if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: - PUSH_REPO_ID = "optimum-benchmark/llm-perf-openvino-intel-debug" - CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] - SUBSET = "unquantized" -elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-openvino-intel-{SUBSET}-{MACHINE}" -else: - raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") - -ATTENTION_CONFIGS = ["eager", "sdpa"] -if SUBSET == "unquantized": - WEIGHTS_CONFIGS = { - # unquantized - "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, - "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, - "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, - } -elif SUBSET == "bnb": - WEIGHTS_CONFIGS = { - # bnb - "4bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_4bit": True}}, - "8bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_8bit": True}}, - } -elif SUBSET == "gptq": - WEIGHTS_CONFIGS = { - # gptq - 
"4bit-gptq-exllama-v1": { - "quant_scheme": "gptq", - "torch_dtype": "float16", - "quant_config": {"bits": 4, "use_exllama ": True, "version": 1, "model_seqlen": 256}, - }, - "4bit-gptq-exllama-v2": { - "torch_dtype": "float16", - "quant_scheme": "gptq", - "quant_config": {"bits": 4, "use_exllama ": True, "version": 2, "model_seqlen": 256}, - }, - } -elif SUBSET == "awq": - WEIGHTS_CONFIGS = { - # awq - "4bit-awq-gemm": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": {"bits": 4, "version": "gemm"}, - }, - "4bit-awq-gemv": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": {"bits": 4, "version": "gemv"}, - }, - "4bit-awq-exllama-v1": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": { - "bits": 4, - "version": "exllama", - "exllama_config": {"version": 1, "max_input_len": 64, "max_batch_size": 1}, - }, - }, - "4bit-awq-exllama-v2": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": { - "bits": 4, - "version": "exllama", - "exllama_config": {"version": 2, "max_input_len": 64, "max_batch_size": 1}, - }, - }, - } - - -LOGGER = getLogger("llm-perf-backend") -LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") -LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") -LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") - - -def benchmark_intel_openvino(model, attn_implementation, weights_config): - benchmark_name = f"{weights_config}-{attn_implementation}" - subfolder = f"{benchmark_name}/{model.replace('/', '--')}" - - quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] - - if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") - return - - if is_benchmark_conducted(PUSH_REPO_ID, subfolder): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") - return - - launcher_config = ProcessConfig() - scenario_config = InferenceConfig( - memory=True, - energy=True, - latency=True, - duration=10, - iterations=10, - warmup_runs=10, - input_shapes=INPUT_SHAPES, - generate_kwargs=GENERATE_KWARGS, - ) - backend_config = OVConfig( - model=model, - device="cpu", - device_ids="0", - no_weights=True, - library="transformers", - task="text-generation", - quantization_config=quant_config, - model_kwargs={"trust_remote_code": True}, - ) - - benchmark_config = BenchmarkConfig( - name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config - ) - - benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - try: - LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") - benchmark_report = Benchmark.launch(benchmark_config) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - except Exception: - LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") - benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - -if __name__ == 
"__main__": - # for isolated process - os.environ["LOG_TO_FILE"] = "0" - os.environ["LOG_LEVEL"] = "INFO" - - # for main process - setup_logging(level="INFO", prefix="MAIN-PROCESS") - - models_attentions_weights = list( - product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) - ) - - LOGGER.info( - f"Running a total of {len(models_attentions_weights)} benchmarks, " - f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " - f"{len(ATTENTION_CONFIGS)} attentions implementations " - f"and {len(WEIGHTS_CONFIGS)} weights configurations." - ) - - for model, attn_implementation, weights_config in models_attentions_weights: - benchmark_intel_openvino(model, attn_implementation, weights_config) diff --git a/llm_perf/hardware/intel/update_llm_perf_intel_ort.py b/llm_perf/hardware/intel/update_llm_perf_intel_ort.py deleted file mode 100644 index 5b8f5c3eb..000000000 --- a/llm_perf/hardware/intel/update_llm_perf_intel_ort.py +++ /dev/null @@ -1,180 +0,0 @@ -import os -from itertools import product -from logging import getLogger - -from llm_perf.utils import ( - CANONICAL_PRETRAINED_OPEN_LLM_LIST, - GENERATE_KWARGS, - INPUT_SHAPES, - OPEN_LLM_LIST, - PRETRAINED_OPEN_LLM_LIST, - is_benchmark_conducted, - is_benchmark_supported, -) -from optimum_benchmark import Benchmark, BenchmarkConfig, InferenceConfig, ORTConfig, ProcessConfig -from optimum_benchmark.logging_utils import setup_logging - -SUBSET = os.getenv("SUBSET", None) -MACHINE = os.getenv("MACHINE", None) -HARDWARE = "intel" - - -if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: - PUSH_REPO_ID = "optimum-benchmark/llm-perf-ort-intel-debug" - CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] - SUBSET = "unquantized" -elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-ort-intel-{SUBSET}-{MACHINE}" -else: - raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") - -ATTENTION_CONFIGS = ["eager", "sdpa"] -if SUBSET == "unquantized": - WEIGHTS_CONFIGS = { - # unquantized - "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, - "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, - "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, - } -elif SUBSET == "bnb": - WEIGHTS_CONFIGS = { - # bnb - "4bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_4bit": True}}, - "8bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_8bit": True}}, - } -elif SUBSET == "gptq": - WEIGHTS_CONFIGS = { - # gptq - "4bit-gptq-exllama-v1": { - "quant_scheme": "gptq", - "torch_dtype": "float16", - "quant_config": {"bits": 4, "use_exllama ": True, "version": 1, "model_seqlen": 256}, - }, - "4bit-gptq-exllama-v2": { - "torch_dtype": "float16", - "quant_scheme": "gptq", - "quant_config": {"bits": 4, "use_exllama ": True, "version": 2, "model_seqlen": 256}, - }, - } -elif SUBSET == "awq": - WEIGHTS_CONFIGS = { - # awq - "4bit-awq-gemm": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": {"bits": 4, "version": "gemm"}, - }, - "4bit-awq-gemv": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": {"bits": 4, "version": "gemv"}, - }, - "4bit-awq-exllama-v1": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": { - "bits": 4, - "version": "exllama", - "exllama_config": {"version": 1, 
"max_input_len": 64, "max_batch_size": 1}, - }, - }, - "4bit-awq-exllama-v2": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": { - "bits": 4, - "version": "exllama", - "exllama_config": {"version": 2, "max_input_len": 64, "max_batch_size": 1}, - }, - }, - } - - -LOGGER = getLogger("llm-perf-backend") -LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") -LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") -LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") - - -def benchmark_intel_ort(model, attn_implementation, weights_config): - benchmark_name = f"{weights_config}-{attn_implementation}" - subfolder = f"{benchmark_name}/{model.replace('/', '--')}" - - torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] - quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] - - if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") - return - - if is_benchmark_conducted(PUSH_REPO_ID, subfolder): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") - return - - launcher_config = ProcessConfig() - scenario_config = InferenceConfig( - memory=True, - energy=True, - latency=True, - duration=10, - iterations=10, - warmup_runs=10, - input_shapes=INPUT_SHAPES, - generate_kwargs=GENERATE_KWARGS, - ) - backend_config = ORTConfig( - model=model, - device="cpu", - device_ids="0", - no_weights=True, - library="transformers", - task="text-generation", - torch_dtype=torch_dtype, - # quantization_scheme=quant_scheme, - quantization_config=quant_config, - # attn_implementation=attn_implementation, - model_kwargs={"trust_remote_code": True}, - ) - - benchmark_config = BenchmarkConfig( - name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config - ) - - benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - # try: - LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") - benchmark_report = Benchmark.launch(benchmark_config) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - # except Exception: - # LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") - # benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) - # benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - # benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - # benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - -if __name__ == "__main__": - # for isolated process - os.environ["LOG_TO_FILE"] = "0" - os.environ["LOG_LEVEL"] = "INFO" - - # for main process - setup_logging(level="INFO", prefix="MAIN-PROCESS") - - models_attentions_weights = list( - product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) - ) - - LOGGER.info( - f"Running a total of {len(models_attentions_weights)} benchmarks, " - f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " - f"{len(ATTENTION_CONFIGS)} attentions implementations " - f"and {len(WEIGHTS_CONFIGS)} weights configurations." 
- ) - - for model, attn_implementation, weights_config in models_attentions_weights: - benchmark_intel_ort(model, attn_implementation, weights_config) diff --git a/llm_perf/hardware/intel/update_llm_perf_intel_pytorch.py b/llm_perf/hardware/intel/update_llm_perf_intel_pytorch.py deleted file mode 100644 index 797cfb2f6..000000000 --- a/llm_perf/hardware/intel/update_llm_perf_intel_pytorch.py +++ /dev/null @@ -1,181 +0,0 @@ -import os -import traceback -from itertools import product -from logging import getLogger - -from llm_perf.utils import ( - CANONICAL_PRETRAINED_OPEN_LLM_LIST, - GENERATE_KWARGS, - INPUT_SHAPES, - OPEN_LLM_LIST, - PRETRAINED_OPEN_LLM_LIST, - is_benchmark_conducted, - is_benchmark_supported, -) -from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport, InferenceConfig, ProcessConfig, PyTorchConfig -from optimum_benchmark.logging_utils import setup_logging - -SUBSET = os.getenv("SUBSET", None) -MACHINE = os.getenv("MACHINE", None) -HARDWARE = "intel" - - -if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: - PUSH_REPO_ID = "optimum-benchmark/llm-perf-pytorch-intel-debug" - CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] - SUBSET = "unquantized" -elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-pytorch-intel-{SUBSET}-{MACHINE}" -else: - raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") - -ATTENTION_CONFIGS = ["eager", "sdpa"] -if SUBSET == "unquantized": - WEIGHTS_CONFIGS = { - # unquantized - "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, - "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, - "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, - } -elif SUBSET == "bnb": - WEIGHTS_CONFIGS = { - # bnb - "4bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_4bit": True}}, - "8bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_8bit": True}}, - } -elif SUBSET == "gptq": - WEIGHTS_CONFIGS = { - # gptq - "4bit-gptq-exllama-v1": { - "quant_scheme": "gptq", - "torch_dtype": "float16", - "quant_config": {"bits": 4, "use_exllama ": True, "version": 1, "model_seqlen": 256}, - }, - "4bit-gptq-exllama-v2": { - "torch_dtype": "float16", - "quant_scheme": "gptq", - "quant_config": {"bits": 4, "use_exllama ": True, "version": 2, "model_seqlen": 256}, - }, - } -elif SUBSET == "awq": - WEIGHTS_CONFIGS = { - # awq - "4bit-awq-gemm": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": {"bits": 4, "version": "gemm"}, - }, - "4bit-awq-gemv": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": {"bits": 4, "version": "gemv"}, - }, - "4bit-awq-exllama-v1": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": { - "bits": 4, - "version": "exllama", - "exllama_config": {"version": 1, "max_input_len": 64, "max_batch_size": 1}, - }, - }, - "4bit-awq-exllama-v2": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": { - "bits": 4, - "version": "exllama", - "exllama_config": {"version": 2, "max_input_len": 64, "max_batch_size": 1}, - }, - }, - } - - -LOGGER = getLogger("llm-perf-backend") -LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") -LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") -LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): 
{len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") - - -def benchmark_intel_pytorch(model, attn_implementation, weights_config): - benchmark_name = f"{weights_config}-{attn_implementation}" - subfolder = f"{benchmark_name}/{model.replace('/', '--')}" - - torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] - quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] - quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] - - if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") - return - - if is_benchmark_conducted(PUSH_REPO_ID, subfolder): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") - return - - launcher_config = ProcessConfig() - scenario_config = InferenceConfig( - memory=True, - energy=True, - latency=True, - duration=10, - iterations=10, - warmup_runs=10, - input_shapes=INPUT_SHAPES, - generate_kwargs=GENERATE_KWARGS, - ) - backend_config = PyTorchConfig( - model=model, - device="cpu", - no_weights=True, - library="transformers", - task="text-generation", - torch_dtype=torch_dtype, - quantization_scheme=quant_scheme, - quantization_config=quant_config, - attn_implementation=attn_implementation, - model_kwargs={"trust_remote_code": True}, - ) - - benchmark_config = BenchmarkConfig( - name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config - ) - - benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - try: - LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") - benchmark_report = Benchmark.launch(benchmark_config) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - except Exception: - LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") - benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - -if __name__ == "__main__": - # for isolated process - os.environ["LOG_TO_FILE"] = "0" - os.environ["LOG_LEVEL"] = "INFO" - - # for main process - setup_logging(level="INFO", prefix="MAIN-PROCESS") - - models_attentions_weights = list( - product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) - ) - - LOGGER.info( - f"Running a total of {len(models_attentions_weights)} benchmarks, " - f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " - f"{len(ATTENTION_CONFIGS)} attentions implementations " - f"and {len(WEIGHTS_CONFIGS)} weights configurations." 
- ) - - for model, attn_implementation, weights_config in models_attentions_weights: - benchmark_intel_pytorch(model, attn_implementation, weights_config) diff --git a/llm_perf/hardware/cuda/update_llm_perf_cuda_pytorch.py b/llm_perf/hardware/update_llm_perf_cuda_pytorch.py similarity index 100% rename from llm_perf/hardware/cuda/update_llm_perf_cuda_pytorch.py rename to llm_perf/hardware/update_llm_perf_cuda_pytorch.py diff --git a/llm_perf/hardware/intel/update_llm_perf_intel.py b/llm_perf/hardware/update_llm_perf_intel.py similarity index 100% rename from llm_perf/hardware/intel/update_llm_perf_intel.py rename to llm_perf/hardware/update_llm_perf_intel.py diff --git a/llm_perf/hardware/utils.py b/llm_perf/hardware/utils.py new file mode 100644 index 000000000..a8cfbfc6d --- /dev/null +++ b/llm_perf/hardware/utils.py @@ -0,0 +1,20 @@ +from typing import Any, Dict, List +import yaml + +class HardwareConfig: + def __init__(self, data: Dict[str, Any]): + self.machine = data['machine'] + self.description = data['description'] + self.type = data['type'] + self.subsets = data['subsets'] + self.backends = data['backends'] + + + def __repr__(self): + return f"HardwareConfig(machine='{self.machine}', description='{self.description}', " \ + f"type={self.type}, subsets={self.subsets}, backends={self.backends})" + +def load_hardware_configs(file_path: str) -> List[HardwareConfig]: + with open(file_path, 'r') as file: + data = yaml.safe_load(file) + return [HardwareConfig(config) for config in data] \ No newline at end of file diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index 9c8763e63..d3e3e3650 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -5,18 +5,22 @@ from huggingface_hub import create_repo, snapshot_download, upload_file from tqdm import tqdm +from .hardware.utils import load_hardware_configs from optimum_benchmark import Benchmark REPO_TYPE = "dataset" MAIN_REPO_ID = "optimum-benchmark/llm-perf-leaderboard" -PERF_REPO_ID = "optimum-benchmark/llm-perf-pytorch-cuda-{subset}-{machine}" +PERF_REPO_ID = "optimum-benchmark/llm-perf-{backend}-{hardware}-{subset}-{machine}" PERF_DF = "perf-df-{subset}-{machine}.csv" LLM_DF = "llm-df.csv" -def gather_benchmarks(subset: str, machine: str): - perf_repo_id = PERF_REPO_ID.format(subset=subset, machine=machine) +def gather_benchmarks(subset: str, machine: str, backend: str, hardware_type: str): + """ + Gather the benchmarks for a given subset and machine + """ + perf_repo_id = PERF_REPO_ID.format(subset=subset, machine=machine, backend=backend, hardware_type=hardware_type) snapshot = snapshot_download(repo_type=REPO_TYPE, repo_id=perf_repo_id, allow_patterns=["**/benchmark.json"]) dfs = [] @@ -31,12 +35,19 @@ def gather_benchmarks(subset: str, machine: str): def update_perf_dfs(): - for subset in ["unquantized", "bnb", "awq", "gptq"]: - for machine in ["1xA10", "1xA100", "1xT4"]: - try: - gather_benchmarks(subset, machine) - except Exception: - print(f"Subset {subset} for machine {machine} not found") + """ + Update the performance dataframes for all subsets and machines + """ + hardware_configs = load_hardware_configs("hardware.yml") + + + for hardware_config in hardware_configs: + for subset in hardware_config.subsets: + for backend in hardware_config.backends: + try: + gather_benchmarks(subset, hardware_config.machine, backend, hardware_config.type) + except Exception: + print(f"Subset {subset} for machine {hardware_config.machine} not found") 
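
Taken together, hardware.yml, load_hardware_configs and the PERF_REPO_ID template let update_perf_dfs resolve one dataset repo per (backend, hardware, subset, machine) combination instead of the previously hard-coded pytorch/cuda lists. Below is a minimal sketch of that expansion, not part of the patch itself: it assumes the {hardware_type} placeholder spelling that a later patch in this series aligns the template with (at this point the template still reads {hardware}), and it inlines a copy of the c7i entry so the snippet stays self-contained.

import yaml

# Repo id scheme the series converges on (placeholder naming is aligned in a later patch).
PERF_REPO_ID_TEMPLATE = "optimum-benchmark/llm-perf-{backend}-{hardware_type}-{subset}-{machine}"

# Inline copy of the c7i entry from llm_perf/hardware.yml, kept here so the
# sketch runs without reading the file from disk.
EXAMPLE_HARDWARE_YML = """
- machine: c7i
  description: 4th-Gen-Intel-Xeon-385W
  hardware_type: intel
  subsets: [unquantized]
  backends: [pytorch, onnxruntime, openvino]
"""

for entry in yaml.safe_load(EXAMPLE_HARDWARE_YML):
    for subset in entry["subsets"]:
        for backend in entry["backends"]:
            # one dataset repo per (backend, hardware_type, subset, machine) combination
            print(
                PERF_REPO_ID_TEMPLATE.format(
                    backend=backend,
                    hardware_type=entry["hardware_type"],
                    subset=subset,
                    machine=entry["machine"],
                )
            )
# prints:
#   optimum-benchmark/llm-perf-pytorch-intel-unquantized-c7i
#   optimum-benchmark/llm-perf-onnxruntime-intel-unquantized-c7i
#   optimum-benchmark/llm-perf-openvino-intel-unquantized-c7i

Keeping the machine/backend matrix in hardware.yml rather than hard-coded inside update_perf_dfs is what allows new hardware to be added to the leaderboard collection by editing a single file.
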
scrapping_script = """ @@ -48,6 +59,9 @@ def update_perf_dfs(): def update_llm_df(): + """ + Scrape the open-llm-leaderboard and update the leaderboard dataframe + """ subprocess.run(scrapping_script, shell=True) create_repo(repo_id=MAIN_REPO_ID, repo_type=REPO_TYPE, exist_ok=True, private=False) upload_file( From f93cc7c0986758e2ec435d8fb2499473392a543d Mon Sep 17 00:00:00 2001 From: baptiste Date: Wed, 4 Sep 2024 10:09:40 +0000 Subject: [PATCH 24/73] update leaderboard collection to support more hardware --- llm_perf/hardware/utils.py | 24 ++++++++++++++---------- llm_perf/update_llm_perf_leaderboard.py | 4 ++-- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/llm_perf/hardware/utils.py b/llm_perf/hardware/utils.py index a8cfbfc6d..bf24f7f3c 100644 --- a/llm_perf/hardware/utils.py +++ b/llm_perf/hardware/utils.py @@ -1,20 +1,24 @@ from typing import Any, Dict, List + import yaml + class HardwareConfig: def __init__(self, data: Dict[str, Any]): - self.machine = data['machine'] - self.description = data['description'] - self.type = data['type'] - self.subsets = data['subsets'] - self.backends = data['backends'] - + self.machine = data["machine"] + self.description = data["description"] + self.type = data["type"] + self.subsets = data["subsets"] + self.backends = data["backends"] def __repr__(self): - return f"HardwareConfig(machine='{self.machine}', description='{self.description}', " \ - f"type={self.type}, subsets={self.subsets}, backends={self.backends})" + return ( + f"HardwareConfig(machine='{self.machine}', description='{self.description}', " + f"type={self.type}, subsets={self.subsets}, backends={self.backends})" + ) + def load_hardware_configs(file_path: str) -> List[HardwareConfig]: - with open(file_path, 'r') as file: + with open(file_path, "r") as file: data = yaml.safe_load(file) - return [HardwareConfig(config) for config in data] \ No newline at end of file + return [HardwareConfig(config) for config in data] diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index d3e3e3650..27b2a0812 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -5,9 +5,10 @@ from huggingface_hub import create_repo, snapshot_download, upload_file from tqdm import tqdm -from .hardware.utils import load_hardware_configs from optimum_benchmark import Benchmark +from .hardware.utils import load_hardware_configs + REPO_TYPE = "dataset" MAIN_REPO_ID = "optimum-benchmark/llm-perf-leaderboard" PERF_REPO_ID = "optimum-benchmark/llm-perf-{backend}-{hardware}-{subset}-{machine}" @@ -40,7 +41,6 @@ def update_perf_dfs(): """ hardware_configs = load_hardware_configs("hardware.yml") - for hardware_config in hardware_configs: for subset in hardware_config.subsets: for backend in hardware_config.backends: From 2fad5931f38f719151d2067955488a7a842822ea Mon Sep 17 00:00:00 2001 From: baptiste Date: Wed, 4 Sep 2024 10:11:23 +0000 Subject: [PATCH 25/73] update leaderboard collection to support more hardware --- llm_perf/update_llm_perf_leaderboard.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index 27b2a0812..544fc4b2a 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -2,13 +2,12 @@ from glob import glob import pandas as pd +from hardware.utils import load_hardware_configs from huggingface_hub import create_repo, snapshot_download, upload_file from tqdm import 
tqdm from optimum_benchmark import Benchmark -from .hardware.utils import load_hardware_configs - REPO_TYPE = "dataset" MAIN_REPO_ID = "optimum-benchmark/llm-perf-leaderboard" PERF_REPO_ID = "optimum-benchmark/llm-perf-{backend}-{hardware}-{subset}-{machine}" From 6f2885c1835be9d25b9378805579d3f97b53755e Mon Sep 17 00:00:00 2001 From: baptiste Date: Wed, 4 Sep 2024 10:16:22 +0000 Subject: [PATCH 26/73] update leaderboard collection to support more hardware --- llm_perf/update_llm_perf_leaderboard.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index 544fc4b2a..1005cb3ad 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -38,7 +38,7 @@ def update_perf_dfs(): """ Update the performance dataframes for all subsets and machines """ - hardware_configs = load_hardware_configs("hardware.yml") + hardware_configs = load_hardware_configs("llm_perf/hardware.yml") for hardware_config in hardware_configs: for subset in hardware_config.subsets: From a59e55403215e3a4c734426276f5a4097a981886 Mon Sep 17 00:00:00 2001 From: baptiste Date: Wed, 4 Sep 2024 11:12:56 +0000 Subject: [PATCH 27/73] update leaderboard collection to support more hardware --- llm_perf/hardware.yml | 8 ++++---- llm_perf/hardware/utils.py | 4 ++-- llm_perf/update_llm_perf_leaderboard.py | 9 +++------ 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/llm_perf/hardware.yml b/llm_perf/hardware.yml index a69f35e33..0d9581793 100644 --- a/llm_perf/hardware.yml +++ b/llm_perf/hardware.yml @@ -1,6 +1,6 @@ - machine: 1xA10 description: A10-24GB-150W 🖥️ - type: cuda + hardware_type: cuda subsets: - unquantized - awq @@ -11,7 +11,7 @@ - machine: 1xA100 description: A100-80GB-275W 🖥️ - type: cuda + hardware_type: cuda subsets: - unquantized - awq @@ -22,7 +22,7 @@ - machine: 1xT4 description: T4-16GB-70W 🖥️ - type: cuda + hardware_type: cuda subsets: - unquantized - awq @@ -33,7 +33,7 @@ - machine: c7i description: 4th-Gen-Intel-Xeon-385W 🖥️ - type: intel + hardware_type: intel subsets: - unquantized - awq diff --git a/llm_perf/hardware/utils.py b/llm_perf/hardware/utils.py index bf24f7f3c..71c9ce706 100644 --- a/llm_perf/hardware/utils.py +++ b/llm_perf/hardware/utils.py @@ -7,14 +7,14 @@ class HardwareConfig: def __init__(self, data: Dict[str, Any]): self.machine = data["machine"] self.description = data["description"] - self.type = data["type"] + self.hardware_type = data["hardware_type"] self.subsets = data["subsets"] self.backends = data["backends"] def __repr__(self): return ( f"HardwareConfig(machine='{self.machine}', description='{self.description}', " - f"type={self.type}, subsets={self.subsets}, backends={self.backends})" + f"hardware_type={self.hardware_type}, subsets={self.subsets}, backends={self.backends})" ) diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index 1005cb3ad..283fc6330 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -10,7 +10,7 @@ REPO_TYPE = "dataset" MAIN_REPO_ID = "optimum-benchmark/llm-perf-leaderboard" -PERF_REPO_ID = "optimum-benchmark/llm-perf-{backend}-{hardware}-{subset}-{machine}" +PERF_REPO_ID = "optimum-benchmark/llm-perf-{backend}-{hardware_type}-{subset}-{machine}" PERF_DF = "perf-df-{subset}-{machine}.csv" LLM_DF = "llm-df.csv" @@ -43,10 +43,7 @@ def update_perf_dfs(): for hardware_config in hardware_configs: for subset in 
hardware_config.subsets: for backend in hardware_config.backends: - try: - gather_benchmarks(subset, hardware_config.machine, backend, hardware_config.type) - except Exception: - print(f"Subset {subset} for machine {hardware_config.machine} not found") + gather_benchmarks(subset, hardware_config.machine, backend, hardware_config.hardware_type) scrapping_script = """ @@ -69,5 +66,5 @@ def update_llm_df(): if __name__ == "__main__": - update_llm_df() + # update_llm_df() update_perf_dfs() From a1937482b48f745078b798e32cc8f54b5e8b04cc Mon Sep 17 00:00:00 2001 From: baptiste Date: Wed, 4 Sep 2024 11:14:29 +0000 Subject: [PATCH 28/73] update leaderboard collection to support more hardware --- llm_perf/update_llm_perf_leaderboard.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index 283fc6330..f0acd4751 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -18,7 +18,7 @@ def gather_benchmarks(subset: str, machine: str, backend: str, hardware_type: str): """ - Gather the benchmarks for a given subset and machine + Gather the benchmarks for a given machine """ perf_repo_id = PERF_REPO_ID.format(subset=subset, machine=machine, backend=backend, hardware_type=hardware_type) snapshot = snapshot_download(repo_type=REPO_TYPE, repo_id=perf_repo_id, allow_patterns=["**/benchmark.json"]) @@ -36,7 +36,7 @@ def gather_benchmarks(subset: str, machine: str, backend: str, hardware_type: st def update_perf_dfs(): """ - Update the performance dataframes for all subsets and machines + Update the performance dataframes for all machines """ hardware_configs = load_hardware_configs("llm_perf/hardware.yml") @@ -66,5 +66,5 @@ def update_llm_df(): if __name__ == "__main__": - # update_llm_df() + update_llm_df() update_perf_dfs() From 31f1ff66be19fd8d3ae41e5110af01bd372bfef5 Mon Sep 17 00:00:00 2001 From: baptiste Date: Wed, 4 Sep 2024 11:26:10 +0000 Subject: [PATCH 29/73] update leaderboard collection to support more hardware --- llm_perf/update_llm_perf_leaderboard.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index f0acd4751..0f4964156 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -43,7 +43,10 @@ def update_perf_dfs(): for hardware_config in hardware_configs: for subset in hardware_config.subsets: for backend in hardware_config.backends: - gather_benchmarks(subset, hardware_config.machine, backend, hardware_config.hardware_type) + try: + gather_benchmarks(subset, hardware_config.machine, backend, hardware_config.hardware_type) + except Exception as e: + print(f"Error gathering benchmarks for {hardware_config.machine} with {hardware_config.hardware_type} and {subset} with {backend}: {e}") scrapping_script = """ From 0f041fbba3a1824cb6389dd9c643bd98dcccd2e4 Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 5 Sep 2024 08:33:18 +0000 Subject: [PATCH 30/73] add new workflow --- ...ml => update_llm_perf_intel_openvino.yaml} | 0 .../update_llm_perf_intel_pytorch.yaml | 56 +++++++++++++++++++ llm_perf/hardware.yml | 7 +-- 3 files changed, 59 insertions(+), 4 deletions(-) rename .github/workflows/{update_llm_perf_intel_pytorch.yml => update_llm_perf_intel_openvino.yaml} (100%) create mode 100644 .github/workflows/update_llm_perf_intel_pytorch.yaml diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml 
b/.github/workflows/update_llm_perf_intel_openvino.yaml similarity index 100% rename from .github/workflows/update_llm_perf_intel_pytorch.yml rename to .github/workflows/update_llm_perf_intel_openvino.yaml diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yaml b/.github/workflows/update_llm_perf_intel_pytorch.yaml new file mode 100644 index 000000000..c972a8869 --- /dev/null +++ b/.github/workflows/update_llm_perf_intel_pytorch.yaml @@ -0,0 +1,56 @@ +name: Update LLM Perf Benchmarks - Intel PyTorch + +on: + workflow_dispatch: + schedule: + - cron: "0 0 * * *" + +concurrency: + cancel-in-progress: true + group: ${{ github.workflow }}-${{ github.ref }} + +env: + IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cpu + +jobs: + run_benchmarks: + strategy: + fail-fast: false + matrix: + backend: [pytorch, onnxruntime, openvino] + subset: [unquantized] + machine: [ + {name: c7i, runs-on: {group: 'aws-c7i-8xlarge-plus'}}, + ] + + runs-on: ${{ matrix.machine.runs-on }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Run benchmarks + uses: addnab/docker-run-action@v3 + env: + BACKEND: ${{ matrix.backend }} + SUBSET: ${{ matrix.subset }} + MACHINE: ${{ matrix.machine.name }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} + with: + image: ${{ env.IMAGE }} + options: | + --rm + --shm-size 64G + --env BACKEND + --env SUBSET + --env MACHINE + --env HF_TOKEN + --env MKL_THREADING_LAYER=GNU + --env HF_HUB_ENABLE_HF_TRANSFER=1 + --volume ${{ github.workspace }}:/workspace + --workdir /workspace + run: | + pip install packaging && pip install einops scipy optimum codecarbon + pip install -U transformers huggingface_hub[hf_transfer] + pip install -e .[onnxruntime,openvino] + python llm_perf/hardware/intel/update_llm_perf_intel.py diff --git a/llm_perf/hardware.yml b/llm_perf/hardware.yml index 0d9581793..d4a6bfb56 100644 --- a/llm_perf/hardware.yml +++ b/llm_perf/hardware.yml @@ -36,8 +36,7 @@ hardware_type: intel subsets: - unquantized - - awq - - bnb - - gptq backends: - - pytorch \ No newline at end of file + - pytorch + - onnxruntime + - openvino \ No newline at end of file From b2330b065170c556c14fd49f79cf59313e21c4e5 Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 5 Sep 2024 08:35:26 +0000 Subject: [PATCH 31/73] add new workflow --- .github/workflows/update_llm_perf_intel_openvino.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/update_llm_perf_intel_openvino.yaml b/.github/workflows/update_llm_perf_intel_openvino.yaml index c972a8869..8558b899e 100644 --- a/.github/workflows/update_llm_perf_intel_openvino.yaml +++ b/.github/workflows/update_llm_perf_intel_openvino.yaml @@ -1,4 +1,4 @@ -name: Update LLM Perf Benchmarks - Intel PyTorch +name: Update LLM Perf Benchmarks - Intel Openvino on: workflow_dispatch: From 2f54e2dde6c071e68819be5680cf307bd11a2f6f Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 5 Sep 2024 08:36:13 +0000 Subject: [PATCH 32/73] add new workflow --- llm_perf/update_llm_perf_leaderboard.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index 0f4964156..e8308d59b 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -46,7 +46,9 @@ def update_perf_dfs(): try: gather_benchmarks(subset, hardware_config.machine, backend, hardware_config.hardware_type) except Exception as e: - print(f"Error gathering benchmarks for {hardware_config.machine} with 
{hardware_config.hardware_type} and {subset} with {backend}: {e}") + print( + f"Error gathering benchmarks for {hardware_config.machine} with {hardware_config.hardware_type} and {subset} with {backend}: {e}" + ) scrapping_script = """ From 8603b682a88d0b080ccb97599eb8edb4919b30f0 Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 5 Sep 2024 08:38:18 +0000 Subject: [PATCH 33/73] add new workflow --- .../update_llm_perf_intel_openvino.yaml | 56 ------------------- 1 file changed, 56 deletions(-) delete mode 100644 .github/workflows/update_llm_perf_intel_openvino.yaml diff --git a/.github/workflows/update_llm_perf_intel_openvino.yaml b/.github/workflows/update_llm_perf_intel_openvino.yaml deleted file mode 100644 index 8558b899e..000000000 --- a/.github/workflows/update_llm_perf_intel_openvino.yaml +++ /dev/null @@ -1,56 +0,0 @@ -name: Update LLM Perf Benchmarks - Intel Openvino - -on: - workflow_dispatch: - schedule: - - cron: "0 0 * * *" - -concurrency: - cancel-in-progress: true - group: ${{ github.workflow }}-${{ github.ref }} - -env: - IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cpu - -jobs: - run_benchmarks: - strategy: - fail-fast: false - matrix: - backend: [pytorch, onnxruntime, openvino] - subset: [unquantized] - machine: [ - {name: c7i, runs-on: {group: 'aws-c7i-8xlarge-plus'}}, - ] - - runs-on: ${{ matrix.machine.runs-on }} - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Run benchmarks - uses: addnab/docker-run-action@v3 - env: - BACKEND: ${{ matrix.backend }} - SUBSET: ${{ matrix.subset }} - MACHINE: ${{ matrix.machine.name }} - HF_TOKEN: ${{ secrets.HF_TOKEN }} - with: - image: ${{ env.IMAGE }} - options: | - --rm - --shm-size 64G - --env BACKEND - --env SUBSET - --env MACHINE - --env HF_TOKEN - --env MKL_THREADING_LAYER=GNU - --env HF_HUB_ENABLE_HF_TRANSFER=1 - --volume ${{ github.workspace }}:/workspace - --workdir /workspace - run: | - pip install packaging && pip install einops scipy optimum codecarbon - pip install -U transformers huggingface_hub[hf_transfer] - pip install -e .[onnxruntime,openvino] - python llm_perf/hardware/intel/update_llm_perf_intel.py From ec829cb70765433a61336ee46634ae866ed0cde6 Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 5 Sep 2024 08:38:36 +0000 Subject: [PATCH 34/73] add new workflow --- .../update_llm_perf_intel_pytorch.yaml | 56 ------------------- 1 file changed, 56 deletions(-) delete mode 100644 .github/workflows/update_llm_perf_intel_pytorch.yaml diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yaml b/.github/workflows/update_llm_perf_intel_pytorch.yaml deleted file mode 100644 index c972a8869..000000000 --- a/.github/workflows/update_llm_perf_intel_pytorch.yaml +++ /dev/null @@ -1,56 +0,0 @@ -name: Update LLM Perf Benchmarks - Intel PyTorch - -on: - workflow_dispatch: - schedule: - - cron: "0 0 * * *" - -concurrency: - cancel-in-progress: true - group: ${{ github.workflow }}-${{ github.ref }} - -env: - IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cpu - -jobs: - run_benchmarks: - strategy: - fail-fast: false - matrix: - backend: [pytorch, onnxruntime, openvino] - subset: [unquantized] - machine: [ - {name: c7i, runs-on: {group: 'aws-c7i-8xlarge-plus'}}, - ] - - runs-on: ${{ matrix.machine.runs-on }} - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Run benchmarks - uses: addnab/docker-run-action@v3 - env: - BACKEND: ${{ matrix.backend }} - SUBSET: ${{ matrix.subset }} - MACHINE: ${{ matrix.machine.name }} - HF_TOKEN: ${{ secrets.HF_TOKEN }} - with: - image: ${{ 
env.IMAGE }} - options: | - --rm - --shm-size 64G - --env BACKEND - --env SUBSET - --env MACHINE - --env HF_TOKEN - --env MKL_THREADING_LAYER=GNU - --env HF_HUB_ENABLE_HF_TRANSFER=1 - --volume ${{ github.workspace }}:/workspace - --workdir /workspace - run: | - pip install packaging && pip install einops scipy optimum codecarbon - pip install -U transformers huggingface_hub[hf_transfer] - pip install -e .[onnxruntime,openvino] - python llm_perf/hardware/intel/update_llm_perf_intel.py From d152c8130e0f36acab878caa1dd0e77264fd97d7 Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 5 Sep 2024 08:45:17 +0000 Subject: [PATCH 35/73] add new workflow --- .../update_llm_perf_intel_openvino.yaml | 56 +++++++++++++++++++ .../update_llm_perf_intel_pytorch.yml | 56 +++++++++++++++++++ 2 files changed, 112 insertions(+) create mode 100644 .github/workflows/update_llm_perf_intel_openvino.yaml create mode 100644 .github/workflows/update_llm_perf_intel_pytorch.yml diff --git a/.github/workflows/update_llm_perf_intel_openvino.yaml b/.github/workflows/update_llm_perf_intel_openvino.yaml new file mode 100644 index 000000000..8558b899e --- /dev/null +++ b/.github/workflows/update_llm_perf_intel_openvino.yaml @@ -0,0 +1,56 @@ +name: Update LLM Perf Benchmarks - Intel Openvino + +on: + workflow_dispatch: + schedule: + - cron: "0 0 * * *" + +concurrency: + cancel-in-progress: true + group: ${{ github.workflow }}-${{ github.ref }} + +env: + IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cpu + +jobs: + run_benchmarks: + strategy: + fail-fast: false + matrix: + backend: [pytorch, onnxruntime, openvino] + subset: [unquantized] + machine: [ + {name: c7i, runs-on: {group: 'aws-c7i-8xlarge-plus'}}, + ] + + runs-on: ${{ matrix.machine.runs-on }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Run benchmarks + uses: addnab/docker-run-action@v3 + env: + BACKEND: ${{ matrix.backend }} + SUBSET: ${{ matrix.subset }} + MACHINE: ${{ matrix.machine.name }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} + with: + image: ${{ env.IMAGE }} + options: | + --rm + --shm-size 64G + --env BACKEND + --env SUBSET + --env MACHINE + --env HF_TOKEN + --env MKL_THREADING_LAYER=GNU + --env HF_HUB_ENABLE_HF_TRANSFER=1 + --volume ${{ github.workspace }}:/workspace + --workdir /workspace + run: | + pip install packaging && pip install einops scipy optimum codecarbon + pip install -U transformers huggingface_hub[hf_transfer] + pip install -e .[onnxruntime,openvino] + python llm_perf/hardware/intel/update_llm_perf_intel.py diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml b/.github/workflows/update_llm_perf_intel_pytorch.yml new file mode 100644 index 000000000..c972a8869 --- /dev/null +++ b/.github/workflows/update_llm_perf_intel_pytorch.yml @@ -0,0 +1,56 @@ +name: Update LLM Perf Benchmarks - Intel PyTorch + +on: + workflow_dispatch: + schedule: + - cron: "0 0 * * *" + +concurrency: + cancel-in-progress: true + group: ${{ github.workflow }}-${{ github.ref }} + +env: + IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cpu + +jobs: + run_benchmarks: + strategy: + fail-fast: false + matrix: + backend: [pytorch, onnxruntime, openvino] + subset: [unquantized] + machine: [ + {name: c7i, runs-on: {group: 'aws-c7i-8xlarge-plus'}}, + ] + + runs-on: ${{ matrix.machine.runs-on }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Run benchmarks + uses: addnab/docker-run-action@v3 + env: + BACKEND: ${{ matrix.backend }} + SUBSET: ${{ matrix.subset }} + MACHINE: ${{ matrix.machine.name }} + HF_TOKEN: ${{ 
secrets.HF_TOKEN }} + with: + image: ${{ env.IMAGE }} + options: | + --rm + --shm-size 64G + --env BACKEND + --env SUBSET + --env MACHINE + --env HF_TOKEN + --env MKL_THREADING_LAYER=GNU + --env HF_HUB_ENABLE_HF_TRANSFER=1 + --volume ${{ github.workspace }}:/workspace + --workdir /workspace + run: | + pip install packaging && pip install einops scipy optimum codecarbon + pip install -U transformers huggingface_hub[hf_transfer] + pip install -e .[onnxruntime,openvino] + python llm_perf/hardware/intel/update_llm_perf_intel.py From 9730b0bd398eeaf203b9af46e851199527f861ce Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 5 Sep 2024 09:05:41 +0000 Subject: [PATCH 36/73] add new workflow --- .github/workflows/update_llm_perf_intel_openvino.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/update_llm_perf_intel_openvino.yaml b/.github/workflows/update_llm_perf_intel_openvino.yaml index 8558b899e..943d16d8f 100644 --- a/.github/workflows/update_llm_perf_intel_openvino.yaml +++ b/.github/workflows/update_llm_perf_intel_openvino.yaml @@ -2,6 +2,7 @@ name: Update LLM Perf Benchmarks - Intel Openvino on: workflow_dispatch: + push: schedule: - cron: "0 0 * * *" From 6e9d33c940ad88322457bf240d826b638b554ba7 Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 5 Sep 2024 10:03:13 +0000 Subject: [PATCH 37/73] add new workflow --- .../update_llm_perf_cuda_pytorch.yaml | 2 +- .../update_llm_perf_intel_openvino.yaml | 57 ----- .../update_llm_perf_intel_pytorch.yml | 56 ----- llm_perf/hardware.yml | 12 +- .../hardware/update_llm_perf_cuda_pytorch.py | 182 --------------- llm_perf/hardware/update_llm_perf_intel.py | 219 ------------------ llm_perf/hardware/utils.py | 24 -- llm_perf/update_llm_perf_leaderboard.py | 2 +- llm_perf/utils.py | 32 +++ 9 files changed, 42 insertions(+), 544 deletions(-) delete mode 100644 .github/workflows/update_llm_perf_intel_openvino.yaml delete mode 100644 .github/workflows/update_llm_perf_intel_pytorch.yml delete mode 100644 llm_perf/hardware/update_llm_perf_cuda_pytorch.py delete mode 100644 llm_perf/hardware/update_llm_perf_intel.py delete mode 100644 llm_perf/hardware/utils.py diff --git a/.github/workflows/update_llm_perf_cuda_pytorch.yaml b/.github/workflows/update_llm_perf_cuda_pytorch.yaml index 567128e5c..2597d7389 100644 --- a/.github/workflows/update_llm_perf_cuda_pytorch.yaml +++ b/.github/workflows/update_llm_perf_cuda_pytorch.yaml @@ -53,4 +53,4 @@ jobs: pip install packaging && pip install flash-attn einops scipy auto-gptq optimum bitsandbytes autoawq codecarbon pip install -U transformers huggingface_hub[hf_transfer] pip install -e . 
- python llm_perf/hardware/cuda/update_llm_perf_cuda_pytorch.py + python llm_perf/hardware_provider/nvidia/update_llm_perf_cuda_pytorch.py diff --git a/.github/workflows/update_llm_perf_intel_openvino.yaml b/.github/workflows/update_llm_perf_intel_openvino.yaml deleted file mode 100644 index 943d16d8f..000000000 --- a/.github/workflows/update_llm_perf_intel_openvino.yaml +++ /dev/null @@ -1,57 +0,0 @@ -name: Update LLM Perf Benchmarks - Intel Openvino - -on: - workflow_dispatch: - push: - schedule: - - cron: "0 0 * * *" - -concurrency: - cancel-in-progress: true - group: ${{ github.workflow }}-${{ github.ref }} - -env: - IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cpu - -jobs: - run_benchmarks: - strategy: - fail-fast: false - matrix: - backend: [pytorch, onnxruntime, openvino] - subset: [unquantized] - machine: [ - {name: c7i, runs-on: {group: 'aws-c7i-8xlarge-plus'}}, - ] - - runs-on: ${{ matrix.machine.runs-on }} - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Run benchmarks - uses: addnab/docker-run-action@v3 - env: - BACKEND: ${{ matrix.backend }} - SUBSET: ${{ matrix.subset }} - MACHINE: ${{ matrix.machine.name }} - HF_TOKEN: ${{ secrets.HF_TOKEN }} - with: - image: ${{ env.IMAGE }} - options: | - --rm - --shm-size 64G - --env BACKEND - --env SUBSET - --env MACHINE - --env HF_TOKEN - --env MKL_THREADING_LAYER=GNU - --env HF_HUB_ENABLE_HF_TRANSFER=1 - --volume ${{ github.workspace }}:/workspace - --workdir /workspace - run: | - pip install packaging && pip install einops scipy optimum codecarbon - pip install -U transformers huggingface_hub[hf_transfer] - pip install -e .[onnxruntime,openvino] - python llm_perf/hardware/intel/update_llm_perf_intel.py diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml b/.github/workflows/update_llm_perf_intel_pytorch.yml deleted file mode 100644 index c972a8869..000000000 --- a/.github/workflows/update_llm_perf_intel_pytorch.yml +++ /dev/null @@ -1,56 +0,0 @@ -name: Update LLM Perf Benchmarks - Intel PyTorch - -on: - workflow_dispatch: - schedule: - - cron: "0 0 * * *" - -concurrency: - cancel-in-progress: true - group: ${{ github.workflow }}-${{ github.ref }} - -env: - IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cpu - -jobs: - run_benchmarks: - strategy: - fail-fast: false - matrix: - backend: [pytorch, onnxruntime, openvino] - subset: [unquantized] - machine: [ - {name: c7i, runs-on: {group: 'aws-c7i-8xlarge-plus'}}, - ] - - runs-on: ${{ matrix.machine.runs-on }} - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Run benchmarks - uses: addnab/docker-run-action@v3 - env: - BACKEND: ${{ matrix.backend }} - SUBSET: ${{ matrix.subset }} - MACHINE: ${{ matrix.machine.name }} - HF_TOKEN: ${{ secrets.HF_TOKEN }} - with: - image: ${{ env.IMAGE }} - options: | - --rm - --shm-size 64G - --env BACKEND - --env SUBSET - --env MACHINE - --env HF_TOKEN - --env MKL_THREADING_LAYER=GNU - --env HF_HUB_ENABLE_HF_TRANSFER=1 - --volume ${{ github.workspace }}:/workspace - --workdir /workspace - run: | - pip install packaging && pip install einops scipy optimum codecarbon - pip install -U transformers huggingface_hub[hf_transfer] - pip install -e .[onnxruntime,openvino] - python llm_perf/hardware/intel/update_llm_perf_intel.py diff --git a/llm_perf/hardware.yml b/llm_perf/hardware.yml index d4a6bfb56..ac7f85b2d 100644 --- a/llm_perf/hardware.yml +++ b/llm_perf/hardware.yml @@ -1,6 +1,7 @@ - machine: 1xA10 description: A10-24GB-150W 🖥️ - hardware_type: cuda + hardware_provider: nvidia + hardware_type: 
gpu subsets: - unquantized - awq @@ -11,7 +12,8 @@ - machine: 1xA100 description: A100-80GB-275W 🖥️ - hardware_type: cuda + hardware_provider: nvidia + hardware_type: gpu subsets: - unquantized - awq @@ -22,7 +24,8 @@ - machine: 1xT4 description: T4-16GB-70W 🖥️ - hardware_type: cuda + hardware_provider: nvidia + hardware_type: gpu subsets: - unquantized - awq @@ -33,7 +36,8 @@ - machine: c7i description: 4th-Gen-Intel-Xeon-385W 🖥️ - hardware_type: intel + hardware_provider: intel + hardware_type: cpu subsets: - unquantized backends: diff --git a/llm_perf/hardware/update_llm_perf_cuda_pytorch.py b/llm_perf/hardware/update_llm_perf_cuda_pytorch.py deleted file mode 100644 index 8b65e1f5c..000000000 --- a/llm_perf/hardware/update_llm_perf_cuda_pytorch.py +++ /dev/null @@ -1,182 +0,0 @@ -import os -import traceback -from itertools import product -from logging import getLogger - -from llm_perf.utils import ( - CANONICAL_PRETRAINED_OPEN_LLM_LIST, - GENERATE_KWARGS, - INPUT_SHAPES, - OPEN_LLM_LIST, - PRETRAINED_OPEN_LLM_LIST, - is_benchmark_conducted, - is_benchmark_supported, -) -from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport, InferenceConfig, ProcessConfig, PyTorchConfig -from optimum_benchmark.logging_utils import setup_logging - -SUBSET = os.getenv("SUBSET", None) -MACHINE = os.getenv("MACHINE", None) -HARDWARE = "cuda" - - -if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: - PUSH_REPO_ID = "optimum-benchmark/llm-perf-pytorch-cuda-debug" - CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] - SUBSET = "unquantized" -elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-pytorch-cuda-{SUBSET}-{MACHINE}" -else: - raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") - -ATTENTION_CONFIGS = ["eager", "sdpa", "flash_attention_2"] -if SUBSET == "unquantized": - WEIGHTS_CONFIGS = { - # unquantized - "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, - "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, - "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, - } -elif SUBSET == "bnb": - WEIGHTS_CONFIGS = { - # bnb - "4bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_4bit": True}}, - "8bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_8bit": True}}, - } -elif SUBSET == "gptq": - WEIGHTS_CONFIGS = { - # gptq - "4bit-gptq-exllama-v1": { - "quant_scheme": "gptq", - "torch_dtype": "float16", - "quant_config": {"bits": 4, "use_exllama ": True, "version": 1, "model_seqlen": 256}, - }, - "4bit-gptq-exllama-v2": { - "torch_dtype": "float16", - "quant_scheme": "gptq", - "quant_config": {"bits": 4, "use_exllama ": True, "version": 2, "model_seqlen": 256}, - }, - } -elif SUBSET == "awq": - WEIGHTS_CONFIGS = { - # awq - "4bit-awq-gemm": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": {"bits": 4, "version": "gemm"}, - }, - "4bit-awq-gemv": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": {"bits": 4, "version": "gemv"}, - }, - "4bit-awq-exllama-v1": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": { - "bits": 4, - "version": "exllama", - "exllama_config": {"version": 1, "max_input_len": 64, "max_batch_size": 1}, - }, - }, - "4bit-awq-exllama-v2": { - "torch_dtype": "float16", - "quant_scheme": "awq", - 
"quant_config": { - "bits": 4, - "version": "exllama", - "exllama_config": {"version": 2, "max_input_len": 64, "max_batch_size": 1}, - }, - }, - } - - -LOGGER = getLogger("llm-perf-backend") -LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") -LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") -LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") - - -def benchmark_cuda_pytorch(model, attn_implementation, weights_config): - benchmark_name = f"{weights_config}-{attn_implementation}" - subfolder = f"{benchmark_name}/{model.replace('/', '--')}" - - torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] - quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] - quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] - - if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") - return - - if is_benchmark_conducted(PUSH_REPO_ID, subfolder): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") - return - - launcher_config = ProcessConfig(device_isolation=True, device_isolation_action="kill") - scenario_config = InferenceConfig( - memory=True, - energy=True, - latency=True, - duration=10, - iterations=10, - warmup_runs=10, - input_shapes=INPUT_SHAPES, - generate_kwargs=GENERATE_KWARGS, - ) - backend_config = PyTorchConfig( - model=model, - device="cuda", - device_ids="0", - no_weights=True, - library="transformers", - task="text-generation", - torch_dtype=torch_dtype, - quantization_scheme=quant_scheme, - quantization_config=quant_config, - attn_implementation=attn_implementation, - model_kwargs={"trust_remote_code": True}, - ) - - benchmark_config = BenchmarkConfig( - name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config - ) - - benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - try: - LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") - benchmark_report = Benchmark.launch(benchmark_config) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - except Exception: - LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") - benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - -if __name__ == "__main__": - # for isolated process - os.environ["LOG_TO_FILE"] = "0" - os.environ["LOG_LEVEL"] = "INFO" - - # for main process - setup_logging(level="INFO", prefix="MAIN-PROCESS") - - models_attentions_weights = list( - product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) - ) - - LOGGER.info( - f"Running a total of {len(models_attentions_weights)} benchmarks, " - f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " - f"{len(ATTENTION_CONFIGS)} attentions implementations " - f"and {len(WEIGHTS_CONFIGS)} weights configurations." 
- ) - - for model, attn_implementation, weights_config in models_attentions_weights: - benchmark_cuda_pytorch(model, attn_implementation, weights_config) diff --git a/llm_perf/hardware/update_llm_perf_intel.py b/llm_perf/hardware/update_llm_perf_intel.py deleted file mode 100644 index 5c760e1bd..000000000 --- a/llm_perf/hardware/update_llm_perf_intel.py +++ /dev/null @@ -1,219 +0,0 @@ -import os -import traceback -from itertools import product -from logging import getLogger - -from llm_perf.utils import ( - CANONICAL_PRETRAINED_OPEN_LLM_LIST, - GENERATE_KWARGS, - INPUT_SHAPES, - OPEN_LLM_LIST, - PRETRAINED_OPEN_LLM_LIST, - is_benchmark_conducted, - is_benchmark_supported, -) -from optimum_benchmark import ( - Benchmark, - BenchmarkConfig, - BenchmarkReport, - InferenceConfig, - ORTConfig, - OVConfig, - ProcessConfig, - PyTorchConfig, -) -from optimum_benchmark.logging_utils import setup_logging - -SUBSET = os.getenv("SUBSET", None) -MACHINE = os.getenv("MACHINE", None) -BACKEND = os.getenv("BACKEND", None) -HARDWARE = "intel" - -if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-intel-debug" - CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] - SUBSET = "unquantized" -elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-intel-{SUBSET}-{MACHINE}" -else: - raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") - -ATTENTION_CONFIGS = ["eager", "sdpa"] - - -if SUBSET == "unquantized": - WEIGHTS_CONFIGS = { - # unquantized - "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, - "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, - "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, - } -elif SUBSET == "bnb": - WEIGHTS_CONFIGS = { - # bnb - "4bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_4bit": True}}, - "8bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_8bit": True}}, - } -elif SUBSET == "gptq": - WEIGHTS_CONFIGS = { - # gptq - "4bit-gptq-exllama-v1": { - "quant_scheme": "gptq", - "torch_dtype": "float16", - "quant_config": {"bits": 4, "use_exllama ": True, "version": 1, "model_seqlen": 256}, - }, - "4bit-gptq-exllama-v2": { - "torch_dtype": "float16", - "quant_scheme": "gptq", - "quant_config": {"bits": 4, "use_exllama ": True, "version": 2, "model_seqlen": 256}, - }, - } -elif SUBSET == "awq": - WEIGHTS_CONFIGS = { - # awq - "4bit-awq-gemm": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": {"bits": 4, "version": "gemm"}, - }, - "4bit-awq-gemv": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": {"bits": 4, "version": "gemv"}, - }, - "4bit-awq-exllama-v1": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": { - "bits": 4, - "version": "exllama", - "exllama_config": {"version": 1, "max_input_len": 64, "max_batch_size": 1}, - }, - }, - "4bit-awq-exllama-v2": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": { - "bits": 4, - "version": "exllama", - "exllama_config": {"version": 2, "max_input_len": 64, "max_batch_size": 1}, - }, - }, - } - - -LOGGER = getLogger("llm-perf-backend") -LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") -LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") 
-LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") - - -def benchmark_intel(model, attn_implementation, weights_config): - benchmark_name = f"{weights_config}-{attn_implementation}-{BACKEND}" - subfolder = f"{benchmark_name}/{model.replace('/', '--')}" - - torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] - quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] - quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] - - if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") - return - - if is_benchmark_conducted(PUSH_REPO_ID, subfolder): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") - return - - launcher_config = ProcessConfig() - scenario_config = InferenceConfig( - memory=True, - energy=True, - latency=True, - duration=10, - iterations=10, - warmup_runs=10, - input_shapes=INPUT_SHAPES, - generate_kwargs=GENERATE_KWARGS, - ) - - if BACKEND == "pytorch": - backend_config = PyTorchConfig( - model=model, - device="cpu", - no_weights=True, - library="transformers", - task="text-generation", - torch_dtype=torch_dtype, - quantization_scheme=quant_scheme, - quantization_config=quant_config, - attn_implementation=attn_implementation, - model_kwargs={"trust_remote_code": True}, - ) - elif BACKEND == "onnxruntime": - backend_config = ORTConfig( - model=model, - device="cpu", - device_ids="0", - no_weights=True, - library="transformers", - task="text-generation", - torch_dtype=torch_dtype, - quantization_config=quant_config, - model_kwargs={"trust_remote_code": True}, - ) - elif BACKEND == "openvino": - backend_config = OVConfig( - model=model, - device="cpu", - device_ids="0", - no_weights=True, - library="transformers", - task="text-generation", - quantization_config=quant_config, - model_kwargs={"trust_remote_code": True}, - ) - else: - raise ValueError(f"Unsupported backend: {BACKEND}") - - benchmark_config = BenchmarkConfig( - name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config - ) - - benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - try: - LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") - benchmark_report = Benchmark.launch(benchmark_config) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - except Exception: - LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") - benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - -if __name__ == "__main__": - # for isolated process - os.environ["LOG_TO_FILE"] = "0" - os.environ["LOG_LEVEL"] = "INFO" - - # for main process - setup_logging(level="INFO", prefix="MAIN-PROCESS") - - models_attentions_weights = list( - product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) - ) - - LOGGER.info( - f"Running a total of {len(models_attentions_weights)} benchmarks, " - f"with 
{len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " - f"{len(ATTENTION_CONFIGS)} attentions implementations " - f"and {len(WEIGHTS_CONFIGS)} weights configurations." - ) - - for model, attn_implementation, weights_config in models_attentions_weights: - benchmark_intel(model, attn_implementation, weights_config) diff --git a/llm_perf/hardware/utils.py b/llm_perf/hardware/utils.py deleted file mode 100644 index 71c9ce706..000000000 --- a/llm_perf/hardware/utils.py +++ /dev/null @@ -1,24 +0,0 @@ -from typing import Any, Dict, List - -import yaml - - -class HardwareConfig: - def __init__(self, data: Dict[str, Any]): - self.machine = data["machine"] - self.description = data["description"] - self.hardware_type = data["hardware_type"] - self.subsets = data["subsets"] - self.backends = data["backends"] - - def __repr__(self): - return ( - f"HardwareConfig(machine='{self.machine}', description='{self.description}', " - f"hardware_type={self.hardware_type}, subsets={self.subsets}, backends={self.backends})" - ) - - -def load_hardware_configs(file_path: str) -> List[HardwareConfig]: - with open(file_path, "r") as file: - data = yaml.safe_load(file) - return [HardwareConfig(config) for config in data] diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index e8308d59b..caaf91ec2 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -2,7 +2,7 @@ from glob import glob import pandas as pd -from hardware.utils import load_hardware_configs +from utils import load_hardware_configs from huggingface_hub import create_repo, snapshot_download, upload_file from tqdm import tqdm diff --git a/llm_perf/utils.py b/llm_perf/utils.py index ea25cdab3..8a5c43a83 100644 --- a/llm_perf/utils.py +++ b/llm_perf/utils.py @@ -1,4 +1,8 @@ +from enum import Enum, auto +from typing import Any, Dict, List + import pandas as pd +import yaml from optimum_benchmark.benchmark.report import BenchmarkReport @@ -146,3 +150,31 @@ def is_benchmark_supported(weights_config, attn_implementation, hardware): return False return True + + +class HardwareType(Enum): + CPU = auto() + GPU = auto() + + +class HardwareConfig: + def __init__(self, data: Dict[str, Any]): + self.machine = data["machine"] + self.description = data["description"] + self.hardware_provider = data["hardware provider"] + self.hardware_type = data["hardware type"] + assert self.hardware_type in HardwareType, f"Hardware type {self.hardware_type} not supported" + self.subsets = data["subsets"] + self.backends = data["backends"] + + def __repr__(self): + return ( + f"HardwareConfig(machine='{self.machine}', description='{self.description}', " + f"hardware_type={self.hardware_type}, subsets={self.subsets}, backends={self.backends})" + ) + + +def load_hardware_configs(file_path: str) -> List[HardwareConfig]: + with open(file_path, "r") as file: + data = yaml.safe_load(file) + return [HardwareConfig(config) for config in data] From 540af0a0e82788ab80d930c997034fcc13cc30b5 Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 5 Sep 2024 10:04:55 +0000 Subject: [PATCH 38/73] add new workflow --- llm_perf/update_llm_perf_leaderboard.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index caaf91ec2..af224c06d 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -2,9 +2,9 @@ from glob import glob import pandas as pd -from utils import 
load_hardware_configs from huggingface_hub import create_repo, snapshot_download, upload_file from tqdm import tqdm +from utils import load_hardware_configs from optimum_benchmark import Benchmark From 452e4b0b365b7480fb8becfaffc6a9f7bce7c814 Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 5 Sep 2024 10:06:53 +0000 Subject: [PATCH 39/73] add new workflow --- .../update_llm_perf_intel_pytorch.yaml | 54 ++++++ .../intel/update_llm_perf_intel_pytorch.py | 141 ++++++++++++++ .../nvidia/update_llm_perf_cuda_pytorch.py | 182 ++++++++++++++++++ 3 files changed, 377 insertions(+) create mode 100644 .github/workflows/update_llm_perf_intel_pytorch.yaml create mode 100644 llm_perf/hardware_provider/intel/update_llm_perf_intel_pytorch.py create mode 100644 llm_perf/hardware_provider/nvidia/update_llm_perf_cuda_pytorch.py diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yaml b/.github/workflows/update_llm_perf_intel_pytorch.yaml new file mode 100644 index 000000000..3cf6bd9bd --- /dev/null +++ b/.github/workflows/update_llm_perf_intel_pytorch.yaml @@ -0,0 +1,54 @@ +name: Update LLM Perf Benchmarks - Intel PyTorch + +on: + workflow_dispatch: + push: + schedule: + - cron: "0 0 * * *" + +concurrency: + cancel-in-progress: true + group: ${{ github.workflow }}-${{ github.ref }} + +env: + IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cpu + +jobs: + run_benchmarks: + strategy: + fail-fast: false + matrix: + subset: [unquantized] + machine: [ + {name: 32vCPU-C7i, runs-on: {group: 'aws-c7i-8xlarge-plus'}}, + ] + + runs-on: ${{ matrix.machine.runs-on }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Run benchmarks + uses: addnab/docker-run-action@v3 + env: + SUBSET: ${{ matrix.subset }} + MACHINE: ${{ matrix.machine.name }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} + with: + image: ${{ env.IMAGE }} + options: | + --rm + --shm-size 64G + --env SUBSET + --env MACHINE + --env HF_TOKEN + --env MKL_THREADING_LAYER=GNU + --env HF_HUB_ENABLE_HF_TRANSFER=1 + --volume ${{ github.workspace }}:/workspace + --workdir /workspace + run: | + pip install packaging && pip install einops scipy optimum codecarbon + pip install -U transformers huggingface_hub[hf_transfer] + pip install -e .[onnxruntime,openvino] + python llm_perf/hardware_provider/intel/update_llm_perf_intel_pytorch.py diff --git a/llm_perf/hardware_provider/intel/update_llm_perf_intel_pytorch.py b/llm_perf/hardware_provider/intel/update_llm_perf_intel_pytorch.py new file mode 100644 index 000000000..3b7dfe781 --- /dev/null +++ b/llm_perf/hardware_provider/intel/update_llm_perf_intel_pytorch.py @@ -0,0 +1,141 @@ +import os +import traceback +from itertools import product +from logging import getLogger + +from llm_perf.utils import ( + CANONICAL_PRETRAINED_OPEN_LLM_LIST, + GENERATE_KWARGS, + INPUT_SHAPES, + OPEN_LLM_LIST, + PRETRAINED_OPEN_LLM_LIST, + is_benchmark_conducted, + is_benchmark_supported, +) +from optimum_benchmark import ( + Benchmark, + BenchmarkConfig, + BenchmarkReport, + InferenceConfig, + ProcessConfig, + PyTorchConfig, +) +from optimum_benchmark.logging_utils import setup_logging + +SUBSET = os.getenv("SUBSET", None) +MACHINE = os.getenv("MACHINE", None) +BACKEND = "pytorch" +HARDWARE = "intel" + +if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: + PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-intel-debug" + CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] + SUBSET = "unquantized" +elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: + 
PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-intel-{SUBSET}-{MACHINE}" +else: + raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") + +ATTENTION_CONFIGS = ["eager", "sdpa"] + + +if SUBSET == "unquantized": + WEIGHTS_CONFIGS = { + # unquantized + "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, + "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, + "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, + } +else: + raise ValueError(f"Subset {SUBSET} not supported") + + +LOGGER = getLogger("llm-perf-backend") +LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") +LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") +LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") + + +def benchmark_intel(model, attn_implementation, weights_config): + benchmark_name = f"{weights_config}-{attn_implementation}-{BACKEND}" + subfolder = f"{benchmark_name}/{model.replace('/', '--')}" + + torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] + quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] + quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] + + if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): + LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") + return + + if is_benchmark_conducted(PUSH_REPO_ID, subfolder): + LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") + return + + launcher_config = ProcessConfig() + scenario_config = InferenceConfig( + memory=True, + energy=True, + latency=True, + duration=10, + iterations=10, + warmup_runs=10, + input_shapes=INPUT_SHAPES, + generate_kwargs=GENERATE_KWARGS, + ) + + backend_config = PyTorchConfig( + model=model, + device="cpu", + no_weights=True, + library="transformers", + task="text-generation", + torch_dtype=torch_dtype, + quantization_scheme=quant_scheme, + quantization_config=quant_config, + attn_implementation=attn_implementation, + model_kwargs={"trust_remote_code": True}, + ) + + benchmark_config = BenchmarkConfig( + name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config + ) + + benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + try: + LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") + benchmark_report = Benchmark.launch(benchmark_config) + benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + except Exception: + LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") + benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) + benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + +if __name__ == "__main__": + # for isolated process + os.environ["LOG_TO_FILE"] = "0" + os.environ["LOG_LEVEL"] = "INFO" + + # for main process + setup_logging(level="INFO", prefix="MAIN-PROCESS") + + models_attentions_weights = list( + product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, 
ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) + ) + + LOGGER.info( + f"Running a total of {len(models_attentions_weights)} benchmarks, " + f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " + f"{len(ATTENTION_CONFIGS)} attentions implementations " + f"and {len(WEIGHTS_CONFIGS)} weights configurations." + ) + + for model, attn_implementation, weights_config in models_attentions_weights: + benchmark_intel(model, attn_implementation, weights_config) diff --git a/llm_perf/hardware_provider/nvidia/update_llm_perf_cuda_pytorch.py b/llm_perf/hardware_provider/nvidia/update_llm_perf_cuda_pytorch.py new file mode 100644 index 000000000..8b65e1f5c --- /dev/null +++ b/llm_perf/hardware_provider/nvidia/update_llm_perf_cuda_pytorch.py @@ -0,0 +1,182 @@ +import os +import traceback +from itertools import product +from logging import getLogger + +from llm_perf.utils import ( + CANONICAL_PRETRAINED_OPEN_LLM_LIST, + GENERATE_KWARGS, + INPUT_SHAPES, + OPEN_LLM_LIST, + PRETRAINED_OPEN_LLM_LIST, + is_benchmark_conducted, + is_benchmark_supported, +) +from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport, InferenceConfig, ProcessConfig, PyTorchConfig +from optimum_benchmark.logging_utils import setup_logging + +SUBSET = os.getenv("SUBSET", None) +MACHINE = os.getenv("MACHINE", None) +HARDWARE = "cuda" + + +if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: + PUSH_REPO_ID = "optimum-benchmark/llm-perf-pytorch-cuda-debug" + CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] + SUBSET = "unquantized" +elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: + PUSH_REPO_ID = f"optimum-benchmark/llm-perf-pytorch-cuda-{SUBSET}-{MACHINE}" +else: + raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") + +ATTENTION_CONFIGS = ["eager", "sdpa", "flash_attention_2"] +if SUBSET == "unquantized": + WEIGHTS_CONFIGS = { + # unquantized + "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, + "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, + "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, + } +elif SUBSET == "bnb": + WEIGHTS_CONFIGS = { + # bnb + "4bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_4bit": True}}, + "8bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_8bit": True}}, + } +elif SUBSET == "gptq": + WEIGHTS_CONFIGS = { + # gptq + "4bit-gptq-exllama-v1": { + "quant_scheme": "gptq", + "torch_dtype": "float16", + "quant_config": {"bits": 4, "use_exllama ": True, "version": 1, "model_seqlen": 256}, + }, + "4bit-gptq-exllama-v2": { + "torch_dtype": "float16", + "quant_scheme": "gptq", + "quant_config": {"bits": 4, "use_exllama ": True, "version": 2, "model_seqlen": 256}, + }, + } +elif SUBSET == "awq": + WEIGHTS_CONFIGS = { + # awq + "4bit-awq-gemm": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": {"bits": 4, "version": "gemm"}, + }, + "4bit-awq-gemv": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": {"bits": 4, "version": "gemv"}, + }, + "4bit-awq-exllama-v1": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": { + "bits": 4, + "version": "exllama", + "exllama_config": {"version": 1, "max_input_len": 64, "max_batch_size": 1}, + }, + }, + "4bit-awq-exllama-v2": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": { + "bits": 4, + "version": 
"exllama", + "exllama_config": {"version": 2, "max_input_len": 64, "max_batch_size": 1}, + }, + }, + } + + +LOGGER = getLogger("llm-perf-backend") +LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") +LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") +LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") + + +def benchmark_cuda_pytorch(model, attn_implementation, weights_config): + benchmark_name = f"{weights_config}-{attn_implementation}" + subfolder = f"{benchmark_name}/{model.replace('/', '--')}" + + torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] + quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] + quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] + + if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): + LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") + return + + if is_benchmark_conducted(PUSH_REPO_ID, subfolder): + LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") + return + + launcher_config = ProcessConfig(device_isolation=True, device_isolation_action="kill") + scenario_config = InferenceConfig( + memory=True, + energy=True, + latency=True, + duration=10, + iterations=10, + warmup_runs=10, + input_shapes=INPUT_SHAPES, + generate_kwargs=GENERATE_KWARGS, + ) + backend_config = PyTorchConfig( + model=model, + device="cuda", + device_ids="0", + no_weights=True, + library="transformers", + task="text-generation", + torch_dtype=torch_dtype, + quantization_scheme=quant_scheme, + quantization_config=quant_config, + attn_implementation=attn_implementation, + model_kwargs={"trust_remote_code": True}, + ) + + benchmark_config = BenchmarkConfig( + name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config + ) + + benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + try: + LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") + benchmark_report = Benchmark.launch(benchmark_config) + benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + except Exception: + LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") + benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) + benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + +if __name__ == "__main__": + # for isolated process + os.environ["LOG_TO_FILE"] = "0" + os.environ["LOG_LEVEL"] = "INFO" + + # for main process + setup_logging(level="INFO", prefix="MAIN-PROCESS") + + models_attentions_weights = list( + product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) + ) + + LOGGER.info( + f"Running a total of {len(models_attentions_weights)} benchmarks, " + f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " + f"{len(ATTENTION_CONFIGS)} attentions implementations " + f"and {len(WEIGHTS_CONFIGS)} weights configurations." 
+ ) + + for model, attn_implementation, weights_config in models_attentions_weights: + benchmark_cuda_pytorch(model, attn_implementation, weights_config) From b25d6e1196eb8e7668eaf150d5064bf0dfc380f7 Mon Sep 17 00:00:00 2001 From: baptiste Date: Fri, 6 Sep 2024 13:35:58 +0000 Subject: [PATCH 40/73] add new workflow --- .../update_llm_perf_cuda_pytorch.yaml | 2 +- .../update_llm_perf_intel_pytorch.yaml | 2 +- .../intel/update_llm_perf_intel_pytorch.py | 141 -------------- .../nvidia/update_llm_perf_cuda_pytorch.py | 182 ------------------ llm_perf/update_llm_perf_leaderboard.py | 15 +- 5 files changed, 12 insertions(+), 330 deletions(-) delete mode 100644 llm_perf/hardware_provider/intel/update_llm_perf_intel_pytorch.py delete mode 100644 llm_perf/hardware_provider/nvidia/update_llm_perf_cuda_pytorch.py diff --git a/.github/workflows/update_llm_perf_cuda_pytorch.yaml b/.github/workflows/update_llm_perf_cuda_pytorch.yaml index 2597d7389..78b2f1f2c 100644 --- a/.github/workflows/update_llm_perf_cuda_pytorch.yaml +++ b/.github/workflows/update_llm_perf_cuda_pytorch.yaml @@ -53,4 +53,4 @@ jobs: pip install packaging && pip install flash-attn einops scipy auto-gptq optimum bitsandbytes autoawq codecarbon pip install -U transformers huggingface_hub[hf_transfer] pip install -e . - python llm_perf/hardware_provider/nvidia/update_llm_perf_cuda_pytorch.py + python llm_perf/scripts/update_llm_perf_cuda_pytorch.py diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yaml b/.github/workflows/update_llm_perf_intel_pytorch.yaml index 3cf6bd9bd..9b44ab711 100644 --- a/.github/workflows/update_llm_perf_intel_pytorch.yaml +++ b/.github/workflows/update_llm_perf_intel_pytorch.yaml @@ -51,4 +51,4 @@ jobs: pip install packaging && pip install einops scipy optimum codecarbon pip install -U transformers huggingface_hub[hf_transfer] pip install -e .[onnxruntime,openvino] - python llm_perf/hardware_provider/intel/update_llm_perf_intel_pytorch.py + python llm_perf/scripts/update_llm_perf_cpu_pytorch.py diff --git a/llm_perf/hardware_provider/intel/update_llm_perf_intel_pytorch.py b/llm_perf/hardware_provider/intel/update_llm_perf_intel_pytorch.py deleted file mode 100644 index 3b7dfe781..000000000 --- a/llm_perf/hardware_provider/intel/update_llm_perf_intel_pytorch.py +++ /dev/null @@ -1,141 +0,0 @@ -import os -import traceback -from itertools import product -from logging import getLogger - -from llm_perf.utils import ( - CANONICAL_PRETRAINED_OPEN_LLM_LIST, - GENERATE_KWARGS, - INPUT_SHAPES, - OPEN_LLM_LIST, - PRETRAINED_OPEN_LLM_LIST, - is_benchmark_conducted, - is_benchmark_supported, -) -from optimum_benchmark import ( - Benchmark, - BenchmarkConfig, - BenchmarkReport, - InferenceConfig, - ProcessConfig, - PyTorchConfig, -) -from optimum_benchmark.logging_utils import setup_logging - -SUBSET = os.getenv("SUBSET", None) -MACHINE = os.getenv("MACHINE", None) -BACKEND = "pytorch" -HARDWARE = "intel" - -if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-intel-debug" - CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] - SUBSET = "unquantized" -elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-intel-{SUBSET}-{MACHINE}" -else: - raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") - -ATTENTION_CONFIGS = ["eager", "sdpa"] - - -if SUBSET == "unquantized": - WEIGHTS_CONFIGS = { - # unquantized 
- "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, - "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, - "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, - } -else: - raise ValueError(f"Subset {SUBSET} not supported") - - -LOGGER = getLogger("llm-perf-backend") -LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") -LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") -LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") - - -def benchmark_intel(model, attn_implementation, weights_config): - benchmark_name = f"{weights_config}-{attn_implementation}-{BACKEND}" - subfolder = f"{benchmark_name}/{model.replace('/', '--')}" - - torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] - quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] - quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] - - if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") - return - - if is_benchmark_conducted(PUSH_REPO_ID, subfolder): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") - return - - launcher_config = ProcessConfig() - scenario_config = InferenceConfig( - memory=True, - energy=True, - latency=True, - duration=10, - iterations=10, - warmup_runs=10, - input_shapes=INPUT_SHAPES, - generate_kwargs=GENERATE_KWARGS, - ) - - backend_config = PyTorchConfig( - model=model, - device="cpu", - no_weights=True, - library="transformers", - task="text-generation", - torch_dtype=torch_dtype, - quantization_scheme=quant_scheme, - quantization_config=quant_config, - attn_implementation=attn_implementation, - model_kwargs={"trust_remote_code": True}, - ) - - benchmark_config = BenchmarkConfig( - name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config - ) - - benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - try: - LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") - benchmark_report = Benchmark.launch(benchmark_config) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - except Exception: - LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") - benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - -if __name__ == "__main__": - # for isolated process - os.environ["LOG_TO_FILE"] = "0" - os.environ["LOG_LEVEL"] = "INFO" - - # for main process - setup_logging(level="INFO", prefix="MAIN-PROCESS") - - models_attentions_weights = list( - product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) - ) - - LOGGER.info( - f"Running a total of {len(models_attentions_weights)} benchmarks, " - f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " - f"{len(ATTENTION_CONFIGS)} attentions implementations " - f"and {len(WEIGHTS_CONFIGS)} weights configurations." 
- ) - - for model, attn_implementation, weights_config in models_attentions_weights: - benchmark_intel(model, attn_implementation, weights_config) diff --git a/llm_perf/hardware_provider/nvidia/update_llm_perf_cuda_pytorch.py b/llm_perf/hardware_provider/nvidia/update_llm_perf_cuda_pytorch.py deleted file mode 100644 index 8b65e1f5c..000000000 --- a/llm_perf/hardware_provider/nvidia/update_llm_perf_cuda_pytorch.py +++ /dev/null @@ -1,182 +0,0 @@ -import os -import traceback -from itertools import product -from logging import getLogger - -from llm_perf.utils import ( - CANONICAL_PRETRAINED_OPEN_LLM_LIST, - GENERATE_KWARGS, - INPUT_SHAPES, - OPEN_LLM_LIST, - PRETRAINED_OPEN_LLM_LIST, - is_benchmark_conducted, - is_benchmark_supported, -) -from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport, InferenceConfig, ProcessConfig, PyTorchConfig -from optimum_benchmark.logging_utils import setup_logging - -SUBSET = os.getenv("SUBSET", None) -MACHINE = os.getenv("MACHINE", None) -HARDWARE = "cuda" - - -if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: - PUSH_REPO_ID = "optimum-benchmark/llm-perf-pytorch-cuda-debug" - CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] - SUBSET = "unquantized" -elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-pytorch-cuda-{SUBSET}-{MACHINE}" -else: - raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") - -ATTENTION_CONFIGS = ["eager", "sdpa", "flash_attention_2"] -if SUBSET == "unquantized": - WEIGHTS_CONFIGS = { - # unquantized - "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, - "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, - "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, - } -elif SUBSET == "bnb": - WEIGHTS_CONFIGS = { - # bnb - "4bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_4bit": True}}, - "8bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_8bit": True}}, - } -elif SUBSET == "gptq": - WEIGHTS_CONFIGS = { - # gptq - "4bit-gptq-exllama-v1": { - "quant_scheme": "gptq", - "torch_dtype": "float16", - "quant_config": {"bits": 4, "use_exllama ": True, "version": 1, "model_seqlen": 256}, - }, - "4bit-gptq-exllama-v2": { - "torch_dtype": "float16", - "quant_scheme": "gptq", - "quant_config": {"bits": 4, "use_exllama ": True, "version": 2, "model_seqlen": 256}, - }, - } -elif SUBSET == "awq": - WEIGHTS_CONFIGS = { - # awq - "4bit-awq-gemm": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": {"bits": 4, "version": "gemm"}, - }, - "4bit-awq-gemv": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": {"bits": 4, "version": "gemv"}, - }, - "4bit-awq-exllama-v1": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": { - "bits": 4, - "version": "exllama", - "exllama_config": {"version": 1, "max_input_len": 64, "max_batch_size": 1}, - }, - }, - "4bit-awq-exllama-v2": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": { - "bits": 4, - "version": "exllama", - "exllama_config": {"version": 2, "max_input_len": 64, "max_batch_size": 1}, - }, - }, - } - - -LOGGER = getLogger("llm-perf-backend") -LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") -LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") 
-LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") - - -def benchmark_cuda_pytorch(model, attn_implementation, weights_config): - benchmark_name = f"{weights_config}-{attn_implementation}" - subfolder = f"{benchmark_name}/{model.replace('/', '--')}" - - torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] - quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] - quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] - - if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") - return - - if is_benchmark_conducted(PUSH_REPO_ID, subfolder): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") - return - - launcher_config = ProcessConfig(device_isolation=True, device_isolation_action="kill") - scenario_config = InferenceConfig( - memory=True, - energy=True, - latency=True, - duration=10, - iterations=10, - warmup_runs=10, - input_shapes=INPUT_SHAPES, - generate_kwargs=GENERATE_KWARGS, - ) - backend_config = PyTorchConfig( - model=model, - device="cuda", - device_ids="0", - no_weights=True, - library="transformers", - task="text-generation", - torch_dtype=torch_dtype, - quantization_scheme=quant_scheme, - quantization_config=quant_config, - attn_implementation=attn_implementation, - model_kwargs={"trust_remote_code": True}, - ) - - benchmark_config = BenchmarkConfig( - name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config - ) - - benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - try: - LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") - benchmark_report = Benchmark.launch(benchmark_config) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - except Exception: - LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") - benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - -if __name__ == "__main__": - # for isolated process - os.environ["LOG_TO_FILE"] = "0" - os.environ["LOG_LEVEL"] = "INFO" - - # for main process - setup_logging(level="INFO", prefix="MAIN-PROCESS") - - models_attentions_weights = list( - product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) - ) - - LOGGER.info( - f"Running a total of {len(models_attentions_weights)} benchmarks, " - f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " - f"{len(ATTENTION_CONFIGS)} attentions implementations " - f"and {len(WEIGHTS_CONFIGS)} weights configurations." 
- ) - - for model, attn_implementation, weights_config in models_attentions_weights: - benchmark_cuda_pytorch(model, attn_implementation, weights_config) diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index af224c06d..26f054d08 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -10,17 +10,22 @@ REPO_TYPE = "dataset" MAIN_REPO_ID = "optimum-benchmark/llm-perf-leaderboard" -PERF_REPO_ID = "optimum-benchmark/llm-perf-{backend}-{hardware_type}-{subset}-{machine}" +PERF_REPO_ID = "optimum-benchmark/llm-perf-{backend}-{hardware}-{subset}-{machine}" PERF_DF = "perf-df-{subset}-{machine}.csv" LLM_DF = "llm-df.csv" -def gather_benchmarks(subset: str, machine: str, backend: str, hardware_type: str): +def gather_benchmarks(subset: str, machine: str, backend: str, hardware_provider: str): """ Gather the benchmarks for a given machine """ - perf_repo_id = PERF_REPO_ID.format(subset=subset, machine=machine, backend=backend, hardware_type=hardware_type) + if hardware_provider == "nvidia": + hardware = "cuda" + else: + hardware = hardware_provider + + perf_repo_id = PERF_REPO_ID.format(subset=subset, machine=machine, backend=backend, hardware=hardware) snapshot = snapshot_download(repo_type=REPO_TYPE, repo_id=perf_repo_id, allow_patterns=["**/benchmark.json"]) dfs = [] @@ -44,10 +49,10 @@ def update_perf_dfs(): for subset in hardware_config.subsets: for backend in hardware_config.backends: try: - gather_benchmarks(subset, hardware_config.machine, backend, hardware_config.hardware_type) + gather_benchmarks(subset, hardware_config.machine, backend, hardware_config.hardware_provider) except Exception as e: print( - f"Error gathering benchmarks for {hardware_config.machine} with {hardware_config.hardware_type} and {subset} with {backend}: {e}" + f"Error gathering benchmarks for {hardware_config.machine} with {hardware_config.hardware_provider} and {subset} with {backend}: {e}" ) From a76e56dc196ab25533157cf90b377f617e7b6719 Mon Sep 17 00:00:00 2001 From: baptiste Date: Fri, 6 Sep 2024 13:37:56 +0000 Subject: [PATCH 41/73] add new workflow --- .../scripts/update_llm_perf_cpu_pytorch.py | 141 ++++++++++++++ .../scripts/update_llm_perf_cuda_pytorch.py | 182 ++++++++++++++++++ 2 files changed, 323 insertions(+) create mode 100644 llm_perf/scripts/update_llm_perf_cpu_pytorch.py create mode 100644 llm_perf/scripts/update_llm_perf_cuda_pytorch.py diff --git a/llm_perf/scripts/update_llm_perf_cpu_pytorch.py b/llm_perf/scripts/update_llm_perf_cpu_pytorch.py new file mode 100644 index 000000000..3b7dfe781 --- /dev/null +++ b/llm_perf/scripts/update_llm_perf_cpu_pytorch.py @@ -0,0 +1,141 @@ +import os +import traceback +from itertools import product +from logging import getLogger + +from llm_perf.utils import ( + CANONICAL_PRETRAINED_OPEN_LLM_LIST, + GENERATE_KWARGS, + INPUT_SHAPES, + OPEN_LLM_LIST, + PRETRAINED_OPEN_LLM_LIST, + is_benchmark_conducted, + is_benchmark_supported, +) +from optimum_benchmark import ( + Benchmark, + BenchmarkConfig, + BenchmarkReport, + InferenceConfig, + ProcessConfig, + PyTorchConfig, +) +from optimum_benchmark.logging_utils import setup_logging + +SUBSET = os.getenv("SUBSET", None) +MACHINE = os.getenv("MACHINE", None) +BACKEND = "pytorch" +HARDWARE = "intel" + +if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: + PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-intel-debug" + CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] + SUBSET = "unquantized" +elif 
os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: + PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-intel-{SUBSET}-{MACHINE}" +else: + raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") + +ATTENTION_CONFIGS = ["eager", "sdpa"] + + +if SUBSET == "unquantized": + WEIGHTS_CONFIGS = { + # unquantized + "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, + "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, + "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, + } +else: + raise ValueError(f"Subset {SUBSET} not supported") + + +LOGGER = getLogger("llm-perf-backend") +LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") +LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") +LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") + + +def benchmark_intel(model, attn_implementation, weights_config): + benchmark_name = f"{weights_config}-{attn_implementation}-{BACKEND}" + subfolder = f"{benchmark_name}/{model.replace('/', '--')}" + + torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] + quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] + quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] + + if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): + LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") + return + + if is_benchmark_conducted(PUSH_REPO_ID, subfolder): + LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") + return + + launcher_config = ProcessConfig() + scenario_config = InferenceConfig( + memory=True, + energy=True, + latency=True, + duration=10, + iterations=10, + warmup_runs=10, + input_shapes=INPUT_SHAPES, + generate_kwargs=GENERATE_KWARGS, + ) + + backend_config = PyTorchConfig( + model=model, + device="cpu", + no_weights=True, + library="transformers", + task="text-generation", + torch_dtype=torch_dtype, + quantization_scheme=quant_scheme, + quantization_config=quant_config, + attn_implementation=attn_implementation, + model_kwargs={"trust_remote_code": True}, + ) + + benchmark_config = BenchmarkConfig( + name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config + ) + + benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + try: + LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") + benchmark_report = Benchmark.launch(benchmark_config) + benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + except Exception: + LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") + benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) + benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + +if __name__ == "__main__": + # for isolated process + os.environ["LOG_TO_FILE"] = "0" + os.environ["LOG_LEVEL"] = "INFO" + + # for main process + setup_logging(level="INFO", prefix="MAIN-PROCESS") + + 
models_attentions_weights = list( + product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) + ) + + LOGGER.info( + f"Running a total of {len(models_attentions_weights)} benchmarks, " + f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " + f"{len(ATTENTION_CONFIGS)} attentions implementations " + f"and {len(WEIGHTS_CONFIGS)} weights configurations." + ) + + for model, attn_implementation, weights_config in models_attentions_weights: + benchmark_intel(model, attn_implementation, weights_config) diff --git a/llm_perf/scripts/update_llm_perf_cuda_pytorch.py b/llm_perf/scripts/update_llm_perf_cuda_pytorch.py new file mode 100644 index 000000000..8b65e1f5c --- /dev/null +++ b/llm_perf/scripts/update_llm_perf_cuda_pytorch.py @@ -0,0 +1,182 @@ +import os +import traceback +from itertools import product +from logging import getLogger + +from llm_perf.utils import ( + CANONICAL_PRETRAINED_OPEN_LLM_LIST, + GENERATE_KWARGS, + INPUT_SHAPES, + OPEN_LLM_LIST, + PRETRAINED_OPEN_LLM_LIST, + is_benchmark_conducted, + is_benchmark_supported, +) +from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport, InferenceConfig, ProcessConfig, PyTorchConfig +from optimum_benchmark.logging_utils import setup_logging + +SUBSET = os.getenv("SUBSET", None) +MACHINE = os.getenv("MACHINE", None) +HARDWARE = "cuda" + + +if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: + PUSH_REPO_ID = "optimum-benchmark/llm-perf-pytorch-cuda-debug" + CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] + SUBSET = "unquantized" +elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: + PUSH_REPO_ID = f"optimum-benchmark/llm-perf-pytorch-cuda-{SUBSET}-{MACHINE}" +else: + raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") + +ATTENTION_CONFIGS = ["eager", "sdpa", "flash_attention_2"] +if SUBSET == "unquantized": + WEIGHTS_CONFIGS = { + # unquantized + "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, + "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, + "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, + } +elif SUBSET == "bnb": + WEIGHTS_CONFIGS = { + # bnb + "4bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_4bit": True}}, + "8bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_8bit": True}}, + } +elif SUBSET == "gptq": + WEIGHTS_CONFIGS = { + # gptq + "4bit-gptq-exllama-v1": { + "quant_scheme": "gptq", + "torch_dtype": "float16", + "quant_config": {"bits": 4, "use_exllama ": True, "version": 1, "model_seqlen": 256}, + }, + "4bit-gptq-exllama-v2": { + "torch_dtype": "float16", + "quant_scheme": "gptq", + "quant_config": {"bits": 4, "use_exllama ": True, "version": 2, "model_seqlen": 256}, + }, + } +elif SUBSET == "awq": + WEIGHTS_CONFIGS = { + # awq + "4bit-awq-gemm": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": {"bits": 4, "version": "gemm"}, + }, + "4bit-awq-gemv": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": {"bits": 4, "version": "gemv"}, + }, + "4bit-awq-exllama-v1": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": { + "bits": 4, + "version": "exllama", + "exllama_config": {"version": 1, "max_input_len": 64, "max_batch_size": 1}, + }, + }, + "4bit-awq-exllama-v2": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": { 
+ "bits": 4, + "version": "exllama", + "exllama_config": {"version": 2, "max_input_len": 64, "max_batch_size": 1}, + }, + }, + } + + +LOGGER = getLogger("llm-perf-backend") +LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") +LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") +LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") + + +def benchmark_cuda_pytorch(model, attn_implementation, weights_config): + benchmark_name = f"{weights_config}-{attn_implementation}" + subfolder = f"{benchmark_name}/{model.replace('/', '--')}" + + torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] + quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] + quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] + + if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): + LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") + return + + if is_benchmark_conducted(PUSH_REPO_ID, subfolder): + LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") + return + + launcher_config = ProcessConfig(device_isolation=True, device_isolation_action="kill") + scenario_config = InferenceConfig( + memory=True, + energy=True, + latency=True, + duration=10, + iterations=10, + warmup_runs=10, + input_shapes=INPUT_SHAPES, + generate_kwargs=GENERATE_KWARGS, + ) + backend_config = PyTorchConfig( + model=model, + device="cuda", + device_ids="0", + no_weights=True, + library="transformers", + task="text-generation", + torch_dtype=torch_dtype, + quantization_scheme=quant_scheme, + quantization_config=quant_config, + attn_implementation=attn_implementation, + model_kwargs={"trust_remote_code": True}, + ) + + benchmark_config = BenchmarkConfig( + name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config + ) + + benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + try: + LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") + benchmark_report = Benchmark.launch(benchmark_config) + benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + except Exception: + LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") + benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) + benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + +if __name__ == "__main__": + # for isolated process + os.environ["LOG_TO_FILE"] = "0" + os.environ["LOG_LEVEL"] = "INFO" + + # for main process + setup_logging(level="INFO", prefix="MAIN-PROCESS") + + models_attentions_weights = list( + product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) + ) + + LOGGER.info( + f"Running a total of {len(models_attentions_weights)} benchmarks, " + f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " + f"{len(ATTENTION_CONFIGS)} attentions implementations " + f"and {len(WEIGHTS_CONFIGS)} weights configurations." 
+ ) + + for model, attn_implementation, weights_config in models_attentions_weights: + benchmark_cuda_pytorch(model, attn_implementation, weights_config) From 6677def85c23af7eb3644c1e48b00df36d83a010 Mon Sep 17 00:00:00 2001 From: baptiste Date: Fri, 6 Sep 2024 14:03:53 +0000 Subject: [PATCH 42/73] add new workflow --- llm_perf/hardware.yml | 8 ++++---- llm_perf/{scripts => }/update_llm_perf_cpu_pytorch.py | 0 .../{scripts => }/update_llm_perf_cuda_pytorch.py | 0 llm_perf/update_llm_perf_leaderboard.py | 11 +++-------- 4 files changed, 7 insertions(+), 12 deletions(-) rename llm_perf/{scripts => }/update_llm_perf_cpu_pytorch.py (100%) rename llm_perf/{scripts => }/update_llm_perf_cuda_pytorch.py (100%) diff --git a/llm_perf/hardware.yml b/llm_perf/hardware.yml index ac7f85b2d..49819a860 100644 --- a/llm_perf/hardware.yml +++ b/llm_perf/hardware.yml @@ -1,7 +1,7 @@ - machine: 1xA10 description: A10-24GB-150W 🖥️ hardware_provider: nvidia - hardware_type: gpu + hardware_backend: cuda subsets: - unquantized - awq @@ -13,7 +13,7 @@ - machine: 1xA100 description: A100-80GB-275W 🖥️ hardware_provider: nvidia - hardware_type: gpu + hardware_backend: cuda subsets: - unquantized - awq @@ -25,7 +25,7 @@ - machine: 1xT4 description: T4-16GB-70W 🖥️ hardware_provider: nvidia - hardware_type: gpu + hardware_backend: cuda subsets: - unquantized - awq @@ -37,7 +37,7 @@ - machine: c7i description: 4th-Gen-Intel-Xeon-385W 🖥️ hardware_provider: intel - hardware_type: cpu + hardware_backend: cpu subsets: - unquantized backends: diff --git a/llm_perf/scripts/update_llm_perf_cpu_pytorch.py b/llm_perf/update_llm_perf_cpu_pytorch.py similarity index 100% rename from llm_perf/scripts/update_llm_perf_cpu_pytorch.py rename to llm_perf/update_llm_perf_cpu_pytorch.py diff --git a/llm_perf/scripts/update_llm_perf_cuda_pytorch.py b/llm_perf/update_llm_perf_cuda_pytorch.py similarity index 100% rename from llm_perf/scripts/update_llm_perf_cuda_pytorch.py rename to llm_perf/update_llm_perf_cuda_pytorch.py diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index 26f054d08..84461f45b 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -10,22 +10,17 @@ REPO_TYPE = "dataset" MAIN_REPO_ID = "optimum-benchmark/llm-perf-leaderboard" -PERF_REPO_ID = "optimum-benchmark/llm-perf-{backend}-{hardware}-{subset}-{machine}" +PERF_REPO_ID = "optimum-benchmark/llm-perf-{backend}-{hardware_backend}-{subset}-{machine}" PERF_DF = "perf-df-{subset}-{machine}.csv" LLM_DF = "llm-df.csv" -def gather_benchmarks(subset: str, machine: str, backend: str, hardware_provider: str): +def gather_benchmarks(subset: str, machine: str, backend: str, hardware_backend: str): """ Gather the benchmarks for a given machine """ - if hardware_provider == "nvidia": - hardware = "cuda" - else: - hardware = hardware_provider - - perf_repo_id = PERF_REPO_ID.format(subset=subset, machine=machine, backend=backend, hardware=hardware) + perf_repo_id = PERF_REPO_ID.format(subset=subset, machine=machine, backend=backend, hardware_backend=hardware_backend) snapshot = snapshot_download(repo_type=REPO_TYPE, repo_id=perf_repo_id, allow_patterns=["**/benchmark.json"]) dfs = [] From 6593487c67b3159ad3f7a0b7bd655a05b5f0849d Mon Sep 17 00:00:00 2001 From: baptiste Date: Fri, 6 Sep 2024 14:04:39 +0000 Subject: [PATCH 43/73] add new workflow --- .github/workflows/update_llm_perf_cuda_pytorch.yaml | 2 +- .github/workflows/update_llm_perf_intel_pytorch.yaml | 2 +- 2 files changed, 2 
insertions(+), 2 deletions(-) diff --git a/.github/workflows/update_llm_perf_cuda_pytorch.yaml b/.github/workflows/update_llm_perf_cuda_pytorch.yaml index 78b2f1f2c..0ab646ab9 100644 --- a/.github/workflows/update_llm_perf_cuda_pytorch.yaml +++ b/.github/workflows/update_llm_perf_cuda_pytorch.yaml @@ -53,4 +53,4 @@ jobs: pip install packaging && pip install flash-attn einops scipy auto-gptq optimum bitsandbytes autoawq codecarbon pip install -U transformers huggingface_hub[hf_transfer] pip install -e . - python llm_perf/scripts/update_llm_perf_cuda_pytorch.py + python llm_perf/update_llm_perf_cuda_pytorch.py diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yaml b/.github/workflows/update_llm_perf_intel_pytorch.yaml index 9b44ab711..6032182f2 100644 --- a/.github/workflows/update_llm_perf_intel_pytorch.yaml +++ b/.github/workflows/update_llm_perf_intel_pytorch.yaml @@ -51,4 +51,4 @@ jobs: pip install packaging && pip install einops scipy optimum codecarbon pip install -U transformers huggingface_hub[hf_transfer] pip install -e .[onnxruntime,openvino] - python llm_perf/scripts/update_llm_perf_cpu_pytorch.py + python llm_perf/update_llm_perf_cpu_pytorch.py From 9802c95e20e85866c10db23d9f56301679a477ae Mon Sep 17 00:00:00 2001 From: baptiste Date: Fri, 6 Sep 2024 14:09:00 +0000 Subject: [PATCH 44/73] add new workflow --- llm_perf/update_llm_perf_cpu_pytorch.py | 8 +++++++- llm_perf/update_llm_perf_cuda_pytorch.py | 12 ++++++++---- llm_perf/update_llm_perf_leaderboard.py | 4 +++- llm_perf/utils.py | 11 ----------- 4 files changed, 18 insertions(+), 17 deletions(-) diff --git a/llm_perf/update_llm_perf_cpu_pytorch.py b/llm_perf/update_llm_perf_cpu_pytorch.py index 3b7dfe781..1e8cce1bc 100644 --- a/llm_perf/update_llm_perf_cpu_pytorch.py +++ b/llm_perf/update_llm_perf_cpu_pytorch.py @@ -10,7 +10,6 @@ OPEN_LLM_LIST, PRETRAINED_OPEN_LLM_LIST, is_benchmark_conducted, - is_benchmark_supported, ) from optimum_benchmark import ( Benchmark, @@ -56,6 +55,13 @@ LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") +def is_benchmark_supported(weights_config, attn_implementation, hardware): + if attn_implementation == "flash_attention_2": + return False + + return True + + def benchmark_intel(model, attn_implementation, weights_config): benchmark_name = f"{weights_config}-{attn_implementation}-{BACKEND}" subfolder = f"{benchmark_name}/{model.replace('/', '--')}" diff --git a/llm_perf/update_llm_perf_cuda_pytorch.py b/llm_perf/update_llm_perf_cuda_pytorch.py index 8b65e1f5c..3a216aca6 100644 --- a/llm_perf/update_llm_perf_cuda_pytorch.py +++ b/llm_perf/update_llm_perf_cuda_pytorch.py @@ -10,15 +10,12 @@ OPEN_LLM_LIST, PRETRAINED_OPEN_LLM_LIST, is_benchmark_conducted, - is_benchmark_supported, ) from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport, InferenceConfig, ProcessConfig, PyTorchConfig from optimum_benchmark.logging_utils import setup_logging SUBSET = os.getenv("SUBSET", None) MACHINE = os.getenv("MACHINE", None) -HARDWARE = "cuda" - if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: PUSH_REPO_ID = "optimum-benchmark/llm-perf-pytorch-cuda-debug" @@ -97,6 +94,13 @@ LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") +def is_benchmark_supported(weights_config, attn_implementation): + if attn_implementation == "flash_attention_2" and weights_config == "float32": + return False + + return True + + def benchmark_cuda_pytorch(model, attn_implementation, 
weights_config): benchmark_name = f"{weights_config}-{attn_implementation}" subfolder = f"{benchmark_name}/{model.replace('/', '--')}" @@ -105,7 +109,7 @@ def benchmark_cuda_pytorch(model, attn_implementation, weights_config): quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] - if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): + if not is_benchmark_supported(weights_config, attn_implementation): LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") return diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index 84461f45b..df45288f5 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -20,7 +20,9 @@ def gather_benchmarks(subset: str, machine: str, backend: str, hardware_backend: """ Gather the benchmarks for a given machine """ - perf_repo_id = PERF_REPO_ID.format(subset=subset, machine=machine, backend=backend, hardware_backend=hardware_backend) + perf_repo_id = PERF_REPO_ID.format( + subset=subset, machine=machine, backend=backend, hardware_backend=hardware_backend + ) snapshot = snapshot_download(repo_type=REPO_TYPE, repo_id=perf_repo_id, allow_patterns=["**/benchmark.json"]) dfs = [] diff --git a/llm_perf/utils.py b/llm_perf/utils.py index 8a5c43a83..94cacc7c0 100644 --- a/llm_perf/utils.py +++ b/llm_perf/utils.py @@ -141,17 +141,6 @@ def is_benchmark_conducted(push_repo_id, subfolder): return False -def is_benchmark_supported(weights_config, attn_implementation, hardware): - if hardware == "cuda": - if attn_implementation == "flash_attention_2" and weights_config == "float32": - return False - elif hardware == "intel": - if attn_implementation == "flash_attention_2": - return False - - return True - - class HardwareType(Enum): CPU = auto() GPU = auto() From b6b947fdba1f9a7350c531633528700b0acf0c29 Mon Sep 17 00:00:00 2001 From: baptiste Date: Fri, 6 Sep 2024 14:11:29 +0000 Subject: [PATCH 45/73] add new workflow --- llm_perf/utils.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/llm_perf/utils.py b/llm_perf/utils.py index 94cacc7c0..8b0d78588 100644 --- a/llm_perf/utils.py +++ b/llm_perf/utils.py @@ -140,26 +140,19 @@ def is_benchmark_conducted(push_repo_id, subfolder): except Exception: return False - -class HardwareType(Enum): - CPU = auto() - GPU = auto() - - class HardwareConfig: def __init__(self, data: Dict[str, Any]): self.machine = data["machine"] self.description = data["description"] self.hardware_provider = data["hardware provider"] - self.hardware_type = data["hardware type"] - assert self.hardware_type in HardwareType, f"Hardware type {self.hardware_type} not supported" + self.hardware_backend = data["hardware_backend type"] self.subsets = data["subsets"] self.backends = data["backends"] def __repr__(self): return ( f"HardwareConfig(machine='{self.machine}', description='{self.description}', " - f"hardware_type={self.hardware_type}, subsets={self.subsets}, backends={self.backends})" + f"hardware_type={self.hardware_backend}, subsets={self.subsets}, backends={self.backends})" ) From 7a891c1f0b9888a5d5a942dfeadfecaefb75ad2a Mon Sep 17 00:00:00 2001 From: baptiste Date: Fri, 6 Sep 2024 14:13:32 +0000 Subject: [PATCH 46/73] add new workflow --- llm_perf/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_perf/utils.py b/llm_perf/utils.py index 8b0d78588..aa4b961e5 100644 --- 
a/llm_perf/utils.py +++ b/llm_perf/utils.py @@ -1,4 +1,3 @@ -from enum import Enum, auto from typing import Any, Dict, List import pandas as pd @@ -140,6 +139,7 @@ def is_benchmark_conducted(push_repo_id, subfolder): except Exception: return False + class HardwareConfig: def __init__(self, data: Dict[str, Any]): self.machine = data["machine"] From a6f289bbbc920b2f78efa129a08e33c8214f6353 Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 10 Sep 2024 12:23:36 +0000 Subject: [PATCH 47/73] remove intel reference --- .github/workflows/update_llm_perf_intel_pytorch.yaml | 2 +- llm_perf/update_llm_perf_cpu_pytorch.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yaml b/.github/workflows/update_llm_perf_intel_pytorch.yaml index 6032182f2..3632d5412 100644 --- a/.github/workflows/update_llm_perf_intel_pytorch.yaml +++ b/.github/workflows/update_llm_perf_intel_pytorch.yaml @@ -50,5 +50,5 @@ jobs: run: | pip install packaging && pip install einops scipy optimum codecarbon pip install -U transformers huggingface_hub[hf_transfer] - pip install -e .[onnxruntime,openvino] + pip install -e . python llm_perf/update_llm_perf_cpu_pytorch.py diff --git a/llm_perf/update_llm_perf_cpu_pytorch.py b/llm_perf/update_llm_perf_cpu_pytorch.py index 1e8cce1bc..def443cfe 100644 --- a/llm_perf/update_llm_perf_cpu_pytorch.py +++ b/llm_perf/update_llm_perf_cpu_pytorch.py @@ -24,14 +24,14 @@ SUBSET = os.getenv("SUBSET", None) MACHINE = os.getenv("MACHINE", None) BACKEND = "pytorch" -HARDWARE = "intel" +HARDWARE = "cpu" if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-intel-debug" + PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-{HARDWARE}-debug" CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] SUBSET = "unquantized" elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-intel-{SUBSET}-{MACHINE}" + PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-{HARDWARE}-{SUBSET}-{MACHINE}" else: raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") @@ -62,7 +62,7 @@ def is_benchmark_supported(weights_config, attn_implementation, hardware): return True -def benchmark_intel(model, attn_implementation, weights_config): +def benchmark_cpu_pytorch(model, attn_implementation, weights_config): benchmark_name = f"{weights_config}-{attn_implementation}-{BACKEND}" subfolder = f"{benchmark_name}/{model.replace('/', '--')}" @@ -144,4 +144,4 @@ def benchmark_intel(model, attn_implementation, weights_config): ) for model, attn_implementation, weights_config in models_attentions_weights: - benchmark_intel(model, attn_implementation, weights_config) + benchmark_cpu_pytorch(model, attn_implementation, weights_config) From e97ee56109f239c5c050439992a7bf391ebbb1b1 Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 10 Sep 2024 12:25:02 +0000 Subject: [PATCH 48/73] remove intel reference --- .../update_llm_perf_intel_pytorch.yaml | 54 ------------------- 1 file changed, 54 deletions(-) delete mode 100644 .github/workflows/update_llm_perf_intel_pytorch.yaml diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yaml b/.github/workflows/update_llm_perf_intel_pytorch.yaml deleted file mode 100644 index 3632d5412..000000000 --- a/.github/workflows/update_llm_perf_intel_pytorch.yaml +++ /dev/null @@ -1,54 +0,0 @@ -name: Update LLM Perf Benchmarks 
- Intel PyTorch - -on: - workflow_dispatch: - push: - schedule: - - cron: "0 0 * * *" - -concurrency: - cancel-in-progress: true - group: ${{ github.workflow }}-${{ github.ref }} - -env: - IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cpu - -jobs: - run_benchmarks: - strategy: - fail-fast: false - matrix: - subset: [unquantized] - machine: [ - {name: 32vCPU-C7i, runs-on: {group: 'aws-c7i-8xlarge-plus'}}, - ] - - runs-on: ${{ matrix.machine.runs-on }} - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Run benchmarks - uses: addnab/docker-run-action@v3 - env: - SUBSET: ${{ matrix.subset }} - MACHINE: ${{ matrix.machine.name }} - HF_TOKEN: ${{ secrets.HF_TOKEN }} - with: - image: ${{ env.IMAGE }} - options: | - --rm - --shm-size 64G - --env SUBSET - --env MACHINE - --env HF_TOKEN - --env MKL_THREADING_LAYER=GNU - --env HF_HUB_ENABLE_HF_TRANSFER=1 - --volume ${{ github.workspace }}:/workspace - --workdir /workspace - run: | - pip install packaging && pip install einops scipy optimum codecarbon - pip install -U transformers huggingface_hub[hf_transfer] - pip install -e . - python llm_perf/update_llm_perf_cpu_pytorch.py From f5f0eebce2e76fa7d23ab7f7aa9a851d9b5c34a5 Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 10 Sep 2024 12:25:09 +0000 Subject: [PATCH 49/73] remove intel reference --- .../update_llm_perf_cpu_pytorch.yaml | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 .github/workflows/update_llm_perf_cpu_pytorch.yaml diff --git a/.github/workflows/update_llm_perf_cpu_pytorch.yaml b/.github/workflows/update_llm_perf_cpu_pytorch.yaml new file mode 100644 index 000000000..3632d5412 --- /dev/null +++ b/.github/workflows/update_llm_perf_cpu_pytorch.yaml @@ -0,0 +1,54 @@ +name: Update LLM Perf Benchmarks - Intel PyTorch + +on: + workflow_dispatch: + push: + schedule: + - cron: "0 0 * * *" + +concurrency: + cancel-in-progress: true + group: ${{ github.workflow }}-${{ github.ref }} + +env: + IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cpu + +jobs: + run_benchmarks: + strategy: + fail-fast: false + matrix: + subset: [unquantized] + machine: [ + {name: 32vCPU-C7i, runs-on: {group: 'aws-c7i-8xlarge-plus'}}, + ] + + runs-on: ${{ matrix.machine.runs-on }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Run benchmarks + uses: addnab/docker-run-action@v3 + env: + SUBSET: ${{ matrix.subset }} + MACHINE: ${{ matrix.machine.name }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} + with: + image: ${{ env.IMAGE }} + options: | + --rm + --shm-size 64G + --env SUBSET + --env MACHINE + --env HF_TOKEN + --env MKL_THREADING_LAYER=GNU + --env HF_HUB_ENABLE_HF_TRANSFER=1 + --volume ${{ github.workspace }}:/workspace + --workdir /workspace + run: | + pip install packaging && pip install einops scipy optimum codecarbon + pip install -U transformers huggingface_hub[hf_transfer] + pip install -e . 
+ python llm_perf/update_llm_perf_cpu_pytorch.py From 55e2c69014badb8df5e6d57f57399fe5b22e587b Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 10 Sep 2024 13:34:39 +0000 Subject: [PATCH 50/73] refractoring done --- .../update_llm_perf_cpu_pytorch.yaml | 54 ----- .../update_llm_perf_cuda_pytorch.yaml | 2 +- .../update_llm_perf_cuda_pytorch.py | 126 ++++++++++++ llm_perf/common/benchmark_runner.py | 116 +++++++++++ llm_perf/common/hardware_config.py | 25 +++ llm_perf/{ => common}/utils.py | 38 ---- llm_perf/hardware.yml | 13 +- llm_perf/update_llm_perf_cpu_pytorch.py | 147 -------------- llm_perf/update_llm_perf_cuda_pytorch.py | 186 ------------------ llm_perf/update_llm_perf_leaderboard.py | 2 +- 10 files changed, 270 insertions(+), 439 deletions(-) delete mode 100644 .github/workflows/update_llm_perf_cpu_pytorch.yaml create mode 100644 llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py create mode 100644 llm_perf/common/benchmark_runner.py create mode 100644 llm_perf/common/hardware_config.py rename llm_perf/{ => common}/utils.py (75%) delete mode 100644 llm_perf/update_llm_perf_cpu_pytorch.py delete mode 100644 llm_perf/update_llm_perf_cuda_pytorch.py diff --git a/.github/workflows/update_llm_perf_cpu_pytorch.yaml b/.github/workflows/update_llm_perf_cpu_pytorch.yaml deleted file mode 100644 index 3632d5412..000000000 --- a/.github/workflows/update_llm_perf_cpu_pytorch.yaml +++ /dev/null @@ -1,54 +0,0 @@ -name: Update LLM Perf Benchmarks - Intel PyTorch - -on: - workflow_dispatch: - push: - schedule: - - cron: "0 0 * * *" - -concurrency: - cancel-in-progress: true - group: ${{ github.workflow }}-${{ github.ref }} - -env: - IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cpu - -jobs: - run_benchmarks: - strategy: - fail-fast: false - matrix: - subset: [unquantized] - machine: [ - {name: 32vCPU-C7i, runs-on: {group: 'aws-c7i-8xlarge-plus'}}, - ] - - runs-on: ${{ matrix.machine.runs-on }} - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Run benchmarks - uses: addnab/docker-run-action@v3 - env: - SUBSET: ${{ matrix.subset }} - MACHINE: ${{ matrix.machine.name }} - HF_TOKEN: ${{ secrets.HF_TOKEN }} - with: - image: ${{ env.IMAGE }} - options: | - --rm - --shm-size 64G - --env SUBSET - --env MACHINE - --env HF_TOKEN - --env MKL_THREADING_LAYER=GNU - --env HF_HUB_ENABLE_HF_TRANSFER=1 - --volume ${{ github.workspace }}:/workspace - --workdir /workspace - run: | - pip install packaging && pip install einops scipy optimum codecarbon - pip install -U transformers huggingface_hub[hf_transfer] - pip install -e . - python llm_perf/update_llm_perf_cpu_pytorch.py diff --git a/.github/workflows/update_llm_perf_cuda_pytorch.yaml b/.github/workflows/update_llm_perf_cuda_pytorch.yaml index 0ab646ab9..658e63fd1 100644 --- a/.github/workflows/update_llm_perf_cuda_pytorch.yaml +++ b/.github/workflows/update_llm_perf_cuda_pytorch.yaml @@ -53,4 +53,4 @@ jobs: pip install packaging && pip install flash-attn einops scipy auto-gptq optimum bitsandbytes autoawq codecarbon pip install -U transformers huggingface_hub[hf_transfer] pip install -e . 
-          python llm_perf/update_llm_perf_cuda_pytorch.py
+          python llm_perf/benchmarks/update_llm_perf_cuda_pytorch.py
diff --git a/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py b/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py
new file mode 100644
index 000000000..f22b28eb0
--- /dev/null
+++ b/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py
@@ -0,0 +1,126 @@
+from typing import Any, Dict, List
+
+from llm_perf.common.benchmark_runner import BenchmarkRunner
+from llm_perf.common.utils import GENERATE_KWARGS, INPUT_SHAPES
+from optimum_benchmark import PyTorchConfig
+from optimum_benchmark.benchmark.config import BenchmarkConfig
+from optimum_benchmark.launchers.process.config import ProcessConfig
+from optimum_benchmark.scenarios.inference.config import InferenceConfig
+
+
+class CUDAPyTorchBenchmarkRunner(BenchmarkRunner):
+    def __init__(self):
+        super().__init__(backend="pytorch", hardware="cuda")
+
+    def is_benchmark_supported(self, weights_config: str, attn_implementation: str) -> bool:
+        if attn_implementation == "flash_attention_2" and weights_config == "float32":
+            return False
+        return True
+
+    def get_benchmark_config(self, model: str, attn_implementation: str, weights_config: str) -> BenchmarkConfig:
+        assert (
+            weights_config in self.weights_configs
+        ), f"your config does not contain {weights_config}, adjust your _get_weights_configs to fix this issue"
+
+        torch_dtype = self.weights_configs[weights_config]["torch_dtype"]
+        quant_scheme = self.weights_configs[weights_config]["quant_scheme"]
+        quant_config = self.weights_configs[weights_config]["quant_config"]
+
+        launcher_config = ProcessConfig(device_isolation=True, device_isolation_action="kill")
+        scenario_config = InferenceConfig(
+            memory=True,
+            energy=True,
+            latency=True,
+            duration=10,
+            iterations=10,
+            warmup_runs=10,
+            input_shapes=INPUT_SHAPES,
+            generate_kwargs=GENERATE_KWARGS,
+        )
+        backend_config = PyTorchConfig(
+            model=model,
+            device="cuda",
+            device_ids="0",
+            no_weights=True,
+            library="transformers",
+            task="text-generation",
+            torch_dtype=torch_dtype,
+            quantization_scheme=quant_scheme,
+            quantization_config=quant_config,
+            attn_implementation=attn_implementation,
+            model_kwargs={"trust_remote_code": True},
+        )
+
+        return BenchmarkConfig(
+            name=f"{weights_config}-{attn_implementation}",
+            scenario=scenario_config,
+            launcher=launcher_config,
+            backend=backend_config,
+        )
+
+    def _get_weights_configs(self, subset) -> Dict[str, Dict[str, Any]]:
+        if subset == "unquantized":
+            return {
+                "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}},
+                "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}},
+                "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}},
+            }
+        elif subset == "bnb":
+            return {
+                "4bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_4bit": True}},
+                "8bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_8bit": True}},
+            }
+        elif subset == "gptq":
+            return {
+                "4bit-gptq-exllama-v1": {
+                    "torch_dtype": "float16",
+                    "quant_scheme": "gptq",
+                    "quant_config": {"bits": 4, "use_exllama": True, "version": 1, "model_seqlen": 256},
+                },
+                "4bit-gptq-exllama-v2": {
+                    "torch_dtype": "float16",
+                    "quant_scheme": "gptq",
+                    "quant_config": {"bits": 4, "use_exllama": True, "version": 2, "model_seqlen": 256},
+                },
+            }
+        elif subset == "awq":
+            return {
+                "4bit-awq-gemm": {
+                    "torch_dtype": "float16",
+                    "quant_scheme": "awq",
+                    
"quant_config": {"bits": 4, "version": "gemm"}, + }, + "4bit-awq-gemv": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": {"bits": 4, "version": "gemv"}, + }, + "4bit-awq-exllama-v1": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": { + "bits": 4, + "version": "exllama", + "exllama_config": {"version": 1, "max_input_len": 64, "max_batch_size": 1}, + }, + }, + "4bit-awq-exllama-v2": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": { + "bits": 4, + "version": "exllama", + "exllama_config": {"version": 2, "max_input_len": 64, "max_batch_size": 1}, + }, + }, + } + else: + raise ValueError(f"Unknown subset: {subset}") + + def _get_attention_configs(self) -> List[str]: + return ["eager", "sdpa", "flash_attention_2"] + + +if __name__ == "__main__": + runner = CUDAPyTorchBenchmarkRunner() + runner.run_benchmarks() diff --git a/llm_perf/common/benchmark_runner.py b/llm_perf/common/benchmark_runner.py new file mode 100644 index 000000000..513e6c726 --- /dev/null +++ b/llm_perf/common/benchmark_runner.py @@ -0,0 +1,116 @@ +import os +import traceback +from abc import ABC, abstractmethod +from itertools import product +from logging import getLogger +from typing import Any, Dict, List, Optional + +from llm_perf.common.utils import ( + CANONICAL_PRETRAINED_OPEN_LLM_LIST, + OPEN_LLM_LIST, + PRETRAINED_OPEN_LLM_LIST, +) +from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport +from optimum_benchmark.logging_utils import setup_logging + + +class BenchmarkRunner(ABC): + def __init__(self, backend: str, hardware: str, subset: Optional[str] = None, machine: Optional[str] = None): + self.backend = backend + self.hardware = hardware + self.subset = subset or os.getenv("SUBSET", None) + self.machine = machine or os.getenv("MACHINE", None) + self.logger = getLogger("llm-perf-backend") + + if self.machine is None and self.subset is None: + self.push_repo_id = f"optimum-benchmark/llm-perf-{self.backend}-{self.hardware}-debug" + self.canonical_pretrained_open_llm_list = ["gpt2"] + self.subset = "unquantized" + elif self.machine is not None and self.subset is not None: + self.push_repo_id = ( + f"optimum-benchmark/llm-perf-{self.backend}-{self.hardware}-{self.subset}-{self.machine}" + ) + else: + raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") + + self.attention_configs = self._get_attention_configs() + self.weights_configs = self._get_weights_configs(self.subset) + + self.logger.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") + self.logger.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") + self.logger.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") + + @abstractmethod + def _get_weights_configs(self, subset: str) -> Dict[str, Dict[str, Any]]: + raise NotImplementedError("This method should be implemented in the child class") + + @abstractmethod + def _get_attention_configs(self) -> List[str]: + raise NotImplementedError("This method should be implemented in the child class") + + @abstractmethod + def is_benchmark_supported(self, weights_config: str, attn_implementation: str) -> bool: + raise NotImplementedError("This method should be implemented in the child class") + + def run_benchmarks(self): + os.environ["LOG_TO_FILE"] = "0" + os.environ["LOG_LEVEL"] = "INFO" + setup_logging(level="INFO", prefix="MAIN-PROCESS") + + models_attentions_weights = list( + product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, 
self.attention_configs, self.weights_configs.keys()) + ) + + self.logger.info( + f"Running a total of {len(models_attentions_weights)} benchmarks, " + f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " + f"{len(self.attention_configs)} attentions implementations " + f"and {len(self.weights_configs)} weights configurations." + ) + + for model, attn_implementation, weights_config in models_attentions_weights: + self.run_benchmark(model, attn_implementation, weights_config) + + def is_benchmark_conducted(self, push_repo_id, subfolder): + try: + report = BenchmarkReport.from_pretrained(repo_id=push_repo_id, subfolder=subfolder) + if "traceback" in report.to_dict(): + return False + else: + return True + except Exception: + return False + + def run_benchmark(self, model: str, attn_implementation: str, weights_config: str): + benchmark_name = f"{weights_config}-{attn_implementation}" + subfolder = f"{benchmark_name}/{model.replace('/', '--')}" + + if not self.is_benchmark_supported(weights_config, attn_implementation): + self.logger.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") + return + + if self.is_benchmark_conducted(self.push_repo_id, subfolder): + self.logger.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") + return + + benchmark_config = self.get_benchmark_config(model, attn_implementation, weights_config) + benchmark_config.push_to_hub(repo_id=self.push_repo_id, subfolder=subfolder, private=True) + self.execute_and_log_benchmark(benchmark_config, subfolder) + + @abstractmethod + def get_benchmark_config(self, model: str, attn_implementation: str, weights_config: str) -> BenchmarkConfig: + raise NotImplementedError("This method should be implemented in the child class") + + def execute_and_log_benchmark(self, benchmark_config: BenchmarkConfig, subfolder: str): + try: + self.logger.info(f"Running benchmark {benchmark_config.name} with model {benchmark_config.backend.model}") + benchmark_report = Benchmark.launch(benchmark_config) + benchmark_report.push_to_hub(repo_id=self.push_repo_id, subfolder=subfolder, private=True) + benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + benchmark.push_to_hub(repo_id=self.push_repo_id, subfolder=subfolder, private=True) + except Exception: + self.logger.error(f"Benchmark {benchmark_config.name} failed with model {benchmark_config.backend.model}") + benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) + benchmark_report.push_to_hub(repo_id=self.push_repo_id, subfolder=subfolder, private=True) + benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + benchmark.push_to_hub(repo_id=self.push_repo_id, subfolder=subfolder, private=True) diff --git a/llm_perf/common/hardware_config.py b/llm_perf/common/hardware_config.py new file mode 100644 index 000000000..5b1cfec3d --- /dev/null +++ b/llm_perf/common/hardware_config.py @@ -0,0 +1,25 @@ +from typing import Any, Dict, List + +import yaml + + +class HardwareConfig: + def __init__(self, data: Dict[str, Any]): + self.machine = data["machine"] + self.description = data["description"] + self.hardware_provider = data["hardware provider"] + self.hardware_backend = data["hardware_backend type"] + self.subsets = data["subsets"] + self.backends = data["backends"] + + def __repr__(self): + return ( + f"HardwareConfig(machine='{self.machine}', description='{self.description}', " + f"hardware_type={self.hardware_backend}, subsets={self.subsets}, 
backends={self.backends})" + ) + + +def load_hardware_configs(file_path: str) -> List[HardwareConfig]: + with open(file_path, "r") as file: + data = yaml.safe_load(file) + return [HardwareConfig(config) for config in data] diff --git a/llm_perf/utils.py b/llm_perf/common/utils.py similarity index 75% rename from llm_perf/utils.py rename to llm_perf/common/utils.py index aa4b961e5..06eba06a0 100644 --- a/llm_perf/utils.py +++ b/llm_perf/common/utils.py @@ -1,9 +1,4 @@ -from typing import Any, Dict, List - import pandas as pd -import yaml - -from optimum_benchmark.benchmark.report import BenchmarkReport INPUT_SHAPES = {"batch_size": 1, "sequence_length": 256} GENERATE_KWARGS = {"max_new_tokens": 64, "min_new_tokens": 64} @@ -127,36 +122,3 @@ "togethercomputer/RedPajama-INCITE-Base-3B-v1", "togethercomputer/RedPajama-INCITE-Base-7B-v0.1", ] - - -def is_benchmark_conducted(push_repo_id, subfolder): - try: - report = BenchmarkReport.from_pretrained(repo_id=push_repo_id, subfolder=subfolder) - if "traceback" in report.to_dict(): - return False - else: - return True - except Exception: - return False - - -class HardwareConfig: - def __init__(self, data: Dict[str, Any]): - self.machine = data["machine"] - self.description = data["description"] - self.hardware_provider = data["hardware provider"] - self.hardware_backend = data["hardware_backend type"] - self.subsets = data["subsets"] - self.backends = data["backends"] - - def __repr__(self): - return ( - f"HardwareConfig(machine='{self.machine}', description='{self.description}', " - f"hardware_type={self.hardware_backend}, subsets={self.subsets}, backends={self.backends})" - ) - - -def load_hardware_configs(file_path: str) -> List[HardwareConfig]: - with open(file_path, "r") as file: - data = yaml.safe_load(file) - return [HardwareConfig(config) for config in data] diff --git a/llm_perf/hardware.yml b/llm_perf/hardware.yml index 49819a860..5f0604e35 100644 --- a/llm_perf/hardware.yml +++ b/llm_perf/hardware.yml @@ -32,15 +32,4 @@ - bnb - gptq backends: - - pytorch - -- machine: c7i - description: 4th-Gen-Intel-Xeon-385W 🖥️ - hardware_provider: intel - hardware_backend: cpu - subsets: - - unquantized - backends: - - pytorch - - onnxruntime - - openvino \ No newline at end of file + - pytorch \ No newline at end of file diff --git a/llm_perf/update_llm_perf_cpu_pytorch.py b/llm_perf/update_llm_perf_cpu_pytorch.py deleted file mode 100644 index def443cfe..000000000 --- a/llm_perf/update_llm_perf_cpu_pytorch.py +++ /dev/null @@ -1,147 +0,0 @@ -import os -import traceback -from itertools import product -from logging import getLogger - -from llm_perf.utils import ( - CANONICAL_PRETRAINED_OPEN_LLM_LIST, - GENERATE_KWARGS, - INPUT_SHAPES, - OPEN_LLM_LIST, - PRETRAINED_OPEN_LLM_LIST, - is_benchmark_conducted, -) -from optimum_benchmark import ( - Benchmark, - BenchmarkConfig, - BenchmarkReport, - InferenceConfig, - ProcessConfig, - PyTorchConfig, -) -from optimum_benchmark.logging_utils import setup_logging - -SUBSET = os.getenv("SUBSET", None) -MACHINE = os.getenv("MACHINE", None) -BACKEND = "pytorch" -HARDWARE = "cpu" - -if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-{HARDWARE}-debug" - CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] - SUBSET = "unquantized" -elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-{HARDWARE}-{SUBSET}-{MACHINE}" -else: - raise ValueError("Either both 
MACHINE and SUBSET should be set for benchmarking or neither for debugging") - -ATTENTION_CONFIGS = ["eager", "sdpa"] - - -if SUBSET == "unquantized": - WEIGHTS_CONFIGS = { - # unquantized - "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, - "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, - "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, - } -else: - raise ValueError(f"Subset {SUBSET} not supported") - - -LOGGER = getLogger("llm-perf-backend") -LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") -LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") -LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") - - -def is_benchmark_supported(weights_config, attn_implementation, hardware): - if attn_implementation == "flash_attention_2": - return False - - return True - - -def benchmark_cpu_pytorch(model, attn_implementation, weights_config): - benchmark_name = f"{weights_config}-{attn_implementation}-{BACKEND}" - subfolder = f"{benchmark_name}/{model.replace('/', '--')}" - - torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] - quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] - quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] - - if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") - return - - if is_benchmark_conducted(PUSH_REPO_ID, subfolder): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") - return - - launcher_config = ProcessConfig() - scenario_config = InferenceConfig( - memory=True, - energy=True, - latency=True, - duration=10, - iterations=10, - warmup_runs=10, - input_shapes=INPUT_SHAPES, - generate_kwargs=GENERATE_KWARGS, - ) - - backend_config = PyTorchConfig( - model=model, - device="cpu", - no_weights=True, - library="transformers", - task="text-generation", - torch_dtype=torch_dtype, - quantization_scheme=quant_scheme, - quantization_config=quant_config, - attn_implementation=attn_implementation, - model_kwargs={"trust_remote_code": True}, - ) - - benchmark_config = BenchmarkConfig( - name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config - ) - - benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - try: - LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") - benchmark_report = Benchmark.launch(benchmark_config) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - except Exception: - LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") - benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - -if __name__ == "__main__": - # for isolated process - os.environ["LOG_TO_FILE"] = "0" - os.environ["LOG_LEVEL"] = "INFO" - - # for main process - setup_logging(level="INFO", prefix="MAIN-PROCESS") - - models_attentions_weights = list( - 
product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) - ) - - LOGGER.info( - f"Running a total of {len(models_attentions_weights)} benchmarks, " - f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " - f"{len(ATTENTION_CONFIGS)} attentions implementations " - f"and {len(WEIGHTS_CONFIGS)} weights configurations." - ) - - for model, attn_implementation, weights_config in models_attentions_weights: - benchmark_cpu_pytorch(model, attn_implementation, weights_config) diff --git a/llm_perf/update_llm_perf_cuda_pytorch.py b/llm_perf/update_llm_perf_cuda_pytorch.py deleted file mode 100644 index 3a216aca6..000000000 --- a/llm_perf/update_llm_perf_cuda_pytorch.py +++ /dev/null @@ -1,186 +0,0 @@ -import os -import traceback -from itertools import product -from logging import getLogger - -from llm_perf.utils import ( - CANONICAL_PRETRAINED_OPEN_LLM_LIST, - GENERATE_KWARGS, - INPUT_SHAPES, - OPEN_LLM_LIST, - PRETRAINED_OPEN_LLM_LIST, - is_benchmark_conducted, -) -from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport, InferenceConfig, ProcessConfig, PyTorchConfig -from optimum_benchmark.logging_utils import setup_logging - -SUBSET = os.getenv("SUBSET", None) -MACHINE = os.getenv("MACHINE", None) - -if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: - PUSH_REPO_ID = "optimum-benchmark/llm-perf-pytorch-cuda-debug" - CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] - SUBSET = "unquantized" -elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-pytorch-cuda-{SUBSET}-{MACHINE}" -else: - raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") - -ATTENTION_CONFIGS = ["eager", "sdpa", "flash_attention_2"] -if SUBSET == "unquantized": - WEIGHTS_CONFIGS = { - # unquantized - "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, - "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, - "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, - } -elif SUBSET == "bnb": - WEIGHTS_CONFIGS = { - # bnb - "4bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_4bit": True}}, - "8bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_8bit": True}}, - } -elif SUBSET == "gptq": - WEIGHTS_CONFIGS = { - # gptq - "4bit-gptq-exllama-v1": { - "quant_scheme": "gptq", - "torch_dtype": "float16", - "quant_config": {"bits": 4, "use_exllama ": True, "version": 1, "model_seqlen": 256}, - }, - "4bit-gptq-exllama-v2": { - "torch_dtype": "float16", - "quant_scheme": "gptq", - "quant_config": {"bits": 4, "use_exllama ": True, "version": 2, "model_seqlen": 256}, - }, - } -elif SUBSET == "awq": - WEIGHTS_CONFIGS = { - # awq - "4bit-awq-gemm": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": {"bits": 4, "version": "gemm"}, - }, - "4bit-awq-gemv": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": {"bits": 4, "version": "gemv"}, - }, - "4bit-awq-exllama-v1": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": { - "bits": 4, - "version": "exllama", - "exllama_config": {"version": 1, "max_input_len": 64, "max_batch_size": 1}, - }, - }, - "4bit-awq-exllama-v2": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": { - "bits": 4, - "version": "exllama", - "exllama_config": {"version": 2, "max_input_len": 64, 
"max_batch_size": 1}, - }, - }, - } - - -LOGGER = getLogger("llm-perf-backend") -LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") -LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") -LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") - - -def is_benchmark_supported(weights_config, attn_implementation): - if attn_implementation == "flash_attention_2" and weights_config == "float32": - return False - - return True - - -def benchmark_cuda_pytorch(model, attn_implementation, weights_config): - benchmark_name = f"{weights_config}-{attn_implementation}" - subfolder = f"{benchmark_name}/{model.replace('/', '--')}" - - torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] - quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] - quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] - - if not is_benchmark_supported(weights_config, attn_implementation): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") - return - - if is_benchmark_conducted(PUSH_REPO_ID, subfolder): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") - return - - launcher_config = ProcessConfig(device_isolation=True, device_isolation_action="kill") - scenario_config = InferenceConfig( - memory=True, - energy=True, - latency=True, - duration=10, - iterations=10, - warmup_runs=10, - input_shapes=INPUT_SHAPES, - generate_kwargs=GENERATE_KWARGS, - ) - backend_config = PyTorchConfig( - model=model, - device="cuda", - device_ids="0", - no_weights=True, - library="transformers", - task="text-generation", - torch_dtype=torch_dtype, - quantization_scheme=quant_scheme, - quantization_config=quant_config, - attn_implementation=attn_implementation, - model_kwargs={"trust_remote_code": True}, - ) - - benchmark_config = BenchmarkConfig( - name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config - ) - - benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - try: - LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") - benchmark_report = Benchmark.launch(benchmark_config) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - except Exception: - LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") - benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - -if __name__ == "__main__": - # for isolated process - os.environ["LOG_TO_FILE"] = "0" - os.environ["LOG_LEVEL"] = "INFO" - - # for main process - setup_logging(level="INFO", prefix="MAIN-PROCESS") - - models_attentions_weights = list( - product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) - ) - - LOGGER.info( - f"Running a total of {len(models_attentions_weights)} benchmarks, " - f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " - f"{len(ATTENTION_CONFIGS)} attentions implementations " - f"and {len(WEIGHTS_CONFIGS)} weights configurations." 
- ) - - for model, attn_implementation, weights_config in models_attentions_weights: - benchmark_cuda_pytorch(model, attn_implementation, weights_config) diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index df45288f5..04c138d42 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -4,8 +4,8 @@ import pandas as pd from huggingface_hub import create_repo, snapshot_download, upload_file from tqdm import tqdm -from utils import load_hardware_configs +from llm_perf.common.utils import load_hardware_configs from optimum_benchmark import Benchmark REPO_TYPE = "dataset" From ae7b939563ecf5e237d908aa0bf93b8581d05185 Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 10 Sep 2024 13:48:57 +0000 Subject: [PATCH 51/73] refractoring done --- llm_perf/update_llm_perf_leaderboard.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index 04c138d42..057315689 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -5,7 +5,7 @@ from huggingface_hub import create_repo, snapshot_download, upload_file from tqdm import tqdm -from llm_perf.common.utils import load_hardware_configs +from llm_perf.common.hardware_config import load_hardware_configs from optimum_benchmark import Benchmark REPO_TYPE = "dataset" From 5c80cad3ff9293b75f8bb24137d25994b6c99af6 Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 10 Sep 2024 13:50:17 +0000 Subject: [PATCH 52/73] refractoring done --- .github/workflows/update_llm_perf_cuda_pytorch.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/update_llm_perf_cuda_pytorch.yaml b/.github/workflows/update_llm_perf_cuda_pytorch.yaml index 658e63fd1..5e3f3e976 100644 --- a/.github/workflows/update_llm_perf_cuda_pytorch.yaml +++ b/.github/workflows/update_llm_perf_cuda_pytorch.yaml @@ -53,4 +53,4 @@ jobs: pip install packaging && pip install flash-attn einops scipy auto-gptq optimum bitsandbytes autoawq codecarbon pip install -U transformers huggingface_hub[hf_transfer] pip install -e . 
- python llm_perf/benchmarks/update_llm_perf_cuda_pytorch.py + python llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py From e75a361aa98dfc43ce71387b05d823fd6aaf8db9 Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 10 Sep 2024 13:51:35 +0000 Subject: [PATCH 53/73] refractoring done --- llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py | 5 +++-- llm_perf/update_llm_perf_leaderboard.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py b/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py index f22b28eb0..d4ebb6d20 100644 --- a/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py +++ b/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py @@ -1,12 +1,13 @@ from typing import Any, Dict, List -from llm_perf.common.benchmark_runner import BenchmarkRunner -from llm_perf.common.utils import GENERATE_KWARGS, INPUT_SHAPES from optimum_benchmark import PyTorchConfig from optimum_benchmark.benchmark.config import BenchmarkConfig from optimum_benchmark.launchers.process.config import ProcessConfig from optimum_benchmark.scenarios.inference.config import InferenceConfig +from ..common.benchmark_runner import BenchmarkRunner +from ..common.utils import GENERATE_KWARGS, INPUT_SHAPES + class CUDAPyTorchBenchmarkRunner(BenchmarkRunner): def __init__(self): diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index 057315689..77c62e347 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -5,9 +5,10 @@ from huggingface_hub import create_repo, snapshot_download, upload_file from tqdm import tqdm -from llm_perf.common.hardware_config import load_hardware_configs from optimum_benchmark import Benchmark +from .common.hardware_config import load_hardware_configs + REPO_TYPE = "dataset" MAIN_REPO_ID = "optimum-benchmark/llm-perf-leaderboard" PERF_REPO_ID = "optimum-benchmark/llm-perf-{backend}-{hardware_backend}-{subset}-{machine}" From 07d1d32c18b28d4b8c5b957aafffe55e8adbea40 Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 10 Sep 2024 13:57:59 +0000 Subject: [PATCH 54/73] refractoring done --- llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py | 4 ++-- llm_perf/update_llm_perf_leaderboard.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py b/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py index d4ebb6d20..89fb6e0ab 100644 --- a/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py +++ b/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py @@ -5,8 +5,8 @@ from optimum_benchmark.launchers.process.config import ProcessConfig from optimum_benchmark.scenarios.inference.config import InferenceConfig -from ..common.benchmark_runner import BenchmarkRunner -from ..common.utils import GENERATE_KWARGS, INPUT_SHAPES +from llm_perf.common.benchmark_runner import BenchmarkRunner +from llm_perf.common.utils import GENERATE_KWARGS, INPUT_SHAPES class CUDAPyTorchBenchmarkRunner(BenchmarkRunner): diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index 77c62e347..9ab650fbb 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -2,13 +2,12 @@ from glob import glob import pandas as pd +from llm_perf.common.hardware_config import load_hardware_configs from huggingface_hub import create_repo, snapshot_download, upload_file from 
tqdm import tqdm from optimum_benchmark import Benchmark -from .common.hardware_config import load_hardware_configs - REPO_TYPE = "dataset" MAIN_REPO_ID = "optimum-benchmark/llm-perf-leaderboard" PERF_REPO_ID = "optimum-benchmark/llm-perf-{backend}-{hardware_backend}-{subset}-{machine}" From 34f958f6d9dfdb1b3c8e78121395ea96921f78b1 Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 10 Sep 2024 13:58:12 +0000 Subject: [PATCH 55/73] refractoring done --- .github/workflows/update_llm_perf_leaderboard.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/update_llm_perf_leaderboard.yaml b/.github/workflows/update_llm_perf_leaderboard.yaml index 10ed80c98..9b63c21cd 100644 --- a/.github/workflows/update_llm_perf_leaderboard.yaml +++ b/.github/workflows/update_llm_perf_leaderboard.yaml @@ -2,6 +2,7 @@ name: Update LLM Perf Leaderboard on: workflow_dispatch: + push: schedule: - cron: "0 */6 * * *" From 35dc1cfc25280dd5bc53391a7e249bb2d1ebdf2e Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 10 Sep 2024 13:59:51 +0000 Subject: [PATCH 56/73] refractoring done --- llm_perf/common/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 llm_perf/common/__init__.py diff --git a/llm_perf/common/__init__.py b/llm_perf/common/__init__.py new file mode 100644 index 000000000..e69de29bb From 93485157d28d9f66ca7c9c3826b6ad32c2e05973 Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 10 Sep 2024 14:06:57 +0000 Subject: [PATCH 57/73] refractoring done --- .../update_llm_perf_cuda_pytorch.py | 5 ++--- llm_perf/common/hardware_config.py | 20 +++++++++---------- llm_perf/hardware.yml | 12 +++-------- llm_perf/update_llm_perf_leaderboard.py | 15 +++++++------- 4 files changed, 23 insertions(+), 29 deletions(-) diff --git a/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py b/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py index 89fb6e0ab..f22b28eb0 100644 --- a/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py +++ b/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py @@ -1,13 +1,12 @@ from typing import Any, Dict, List +from llm_perf.common.benchmark_runner import BenchmarkRunner +from llm_perf.common.utils import GENERATE_KWARGS, INPUT_SHAPES from optimum_benchmark import PyTorchConfig from optimum_benchmark.benchmark.config import BenchmarkConfig from optimum_benchmark.launchers.process.config import ProcessConfig from optimum_benchmark.scenarios.inference.config import InferenceConfig -from llm_perf.common.benchmark_runner import BenchmarkRunner -from llm_perf.common.utils import GENERATE_KWARGS, INPUT_SHAPES - class CUDAPyTorchBenchmarkRunner(BenchmarkRunner): def __init__(self): diff --git a/llm_perf/common/hardware_config.py b/llm_perf/common/hardware_config.py index 5b1cfec3d..296c46b28 100644 --- a/llm_perf/common/hardware_config.py +++ b/llm_perf/common/hardware_config.py @@ -2,24 +2,24 @@ import yaml +from dataclasses import dataclass +from typing import List +@dataclass class HardwareConfig: - def __init__(self, data: Dict[str, Any]): - self.machine = data["machine"] - self.description = data["description"] - self.hardware_provider = data["hardware provider"] - self.hardware_backend = data["hardware_backend type"] - self.subsets = data["subsets"] - self.backends = data["backends"] + machine: str + hardware: str + subsets: List[str] + backends: List[str] def __repr__(self): return ( - f"HardwareConfig(machine='{self.machine}', description='{self.description}', " - f"hardware_type={self.hardware_backend}, 
subsets={self.subsets}, backends={self.backends})" + f"HardwareConfig(machine='{self.machine}', hardware='{self.hardware}', " + f"subsets={self.subsets}, backends={self.backends})" ) def load_hardware_configs(file_path: str) -> List[HardwareConfig]: with open(file_path, "r") as file: data = yaml.safe_load(file) - return [HardwareConfig(config) for config in data] + return [HardwareConfig(**config) for config in data] diff --git a/llm_perf/hardware.yml b/llm_perf/hardware.yml index 5f0604e35..40f579189 100644 --- a/llm_perf/hardware.yml +++ b/llm_perf/hardware.yml @@ -1,7 +1,5 @@ - machine: 1xA10 - description: A10-24GB-150W 🖥️ - hardware_provider: nvidia - hardware_backend: cuda + hardware: cuda subsets: - unquantized - awq @@ -11,9 +9,7 @@ - pytorch - machine: 1xA100 - description: A100-80GB-275W 🖥️ - hardware_provider: nvidia - hardware_backend: cuda + hardware: cuda subsets: - unquantized - awq @@ -23,9 +19,7 @@ - pytorch - machine: 1xT4 - description: T4-16GB-70W 🖥️ - hardware_provider: nvidia - hardware_backend: cuda + hardware: cuda subsets: - unquantized - awq diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index 9ab650fbb..b3cf0888f 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -2,26 +2,26 @@ from glob import glob import pandas as pd -from llm_perf.common.hardware_config import load_hardware_configs from huggingface_hub import create_repo, snapshot_download, upload_file from tqdm import tqdm +from llm_perf.common.hardware_config import load_hardware_configs from optimum_benchmark import Benchmark REPO_TYPE = "dataset" MAIN_REPO_ID = "optimum-benchmark/llm-perf-leaderboard" -PERF_REPO_ID = "optimum-benchmark/llm-perf-{backend}-{hardware_backend}-{subset}-{machine}" +PERF_REPO_ID = "optimum-benchmark/llm-perf-{backend}-{hardware}-{subset}-{machine}" PERF_DF = "perf-df-{subset}-{machine}.csv" LLM_DF = "llm-df.csv" -def gather_benchmarks(subset: str, machine: str, backend: str, hardware_backend: str): +def gather_benchmarks(subset: str, machine: str, backend: str, hardware: str): """ Gather the benchmarks for a given machine """ perf_repo_id = PERF_REPO_ID.format( - subset=subset, machine=machine, backend=backend, hardware_backend=hardware_backend + subset=subset, machine=machine, backend=backend, hardware=hardware ) snapshot = snapshot_download(repo_type=REPO_TYPE, repo_id=perf_repo_id, allow_patterns=["**/benchmark.json"]) @@ -46,12 +46,13 @@ def update_perf_dfs(): for subset in hardware_config.subsets: for backend in hardware_config.backends: try: - gather_benchmarks(subset, hardware_config.machine, backend, hardware_config.hardware_provider) + gather_benchmarks(subset, hardware_config.machine, backend, hardware_config.hardware) except Exception as e: print( - f"Error gathering benchmarks for {hardware_config.machine} with {hardware_config.hardware_provider} and {subset} with {backend}: {e}" + f"Error gathering benchmarks for machine {hardware_config.machine}, " + f"hardware {hardware_config.hardware}, subset {subset}, backend {backend}: {e}" ) - + scrapping_script = """ git clone https://github.com/Weyaxi/scrape-open-llm-leaderboard.git From 8b28005da2ec2a7d59bc2723d8de2c27ca984bc7 Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 10 Sep 2024 14:07:23 +0000 Subject: [PATCH 58/73] refractoring done --- llm_perf/common/hardware_config.py | 5 ++--- llm_perf/update_llm_perf_leaderboard.py | 6 ++---- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git 
a/llm_perf/common/hardware_config.py b/llm_perf/common/hardware_config.py index 296c46b28..ed28222e2 100644 --- a/llm_perf/common/hardware_config.py +++ b/llm_perf/common/hardware_config.py @@ -1,9 +1,8 @@ -from typing import Any, Dict, List +from dataclasses import dataclass +from typing import List import yaml -from dataclasses import dataclass -from typing import List @dataclass class HardwareConfig: diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index b3cf0888f..80a7dfbd7 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -20,9 +20,7 @@ def gather_benchmarks(subset: str, machine: str, backend: str, hardware: str): """ Gather the benchmarks for a given machine """ - perf_repo_id = PERF_REPO_ID.format( - subset=subset, machine=machine, backend=backend, hardware=hardware - ) + perf_repo_id = PERF_REPO_ID.format(subset=subset, machine=machine, backend=backend, hardware=hardware) snapshot = snapshot_download(repo_type=REPO_TYPE, repo_id=perf_repo_id, allow_patterns=["**/benchmark.json"]) dfs = [] @@ -52,7 +50,7 @@ def update_perf_dfs(): f"Error gathering benchmarks for machine {hardware_config.machine}, " f"hardware {hardware_config.hardware}, subset {subset}, backend {backend}: {e}" ) - + scrapping_script = """ git clone https://github.com/Weyaxi/scrape-open-llm-leaderboard.git From 7cb3ea0bdb45ffb1e24ac708d8632862006b3e72 Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 10 Sep 2024 14:32:48 +0000 Subject: [PATCH 59/73] remove push on workflow used for debugging --- .github/workflows/update_llm_perf_leaderboard.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/update_llm_perf_leaderboard.yaml b/.github/workflows/update_llm_perf_leaderboard.yaml index 9b63c21cd..10ed80c98 100644 --- a/.github/workflows/update_llm_perf_leaderboard.yaml +++ b/.github/workflows/update_llm_perf_leaderboard.yaml @@ -2,7 +2,6 @@ name: Update LLM Perf Leaderboard on: workflow_dispatch: - push: schedule: - cron: "0 */6 * * *" From c4c888705ca5ab725b9860202974bd551114e6bb Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 12 Sep 2024 11:08:47 +0000 Subject: [PATCH 60/73] refractor pytorch cpu --- .../update_llm_perf_cpu_pytorch.py | 71 +++++++ llm_perf/common/benchmark_runner.py | 3 +- llm_perf/update_llm_perf_cpu_pytorch.py | 147 -------------- llm_perf/update_llm_perf_cuda_pytorch.py | 186 ------------------ 4 files changed, 72 insertions(+), 335 deletions(-) create mode 100644 llm_perf/benchmark_runners/update_llm_perf_cpu_pytorch.py delete mode 100644 llm_perf/update_llm_perf_cpu_pytorch.py delete mode 100644 llm_perf/update_llm_perf_cuda_pytorch.py diff --git a/llm_perf/benchmark_runners/update_llm_perf_cpu_pytorch.py b/llm_perf/benchmark_runners/update_llm_perf_cpu_pytorch.py new file mode 100644 index 000000000..62a52a67b --- /dev/null +++ b/llm_perf/benchmark_runners/update_llm_perf_cpu_pytorch.py @@ -0,0 +1,71 @@ +from typing import Any, Dict, List + +from llm_perf.common.benchmark_runner import BenchmarkRunner +from llm_perf.common.utils import GENERATE_KWARGS, INPUT_SHAPES +from optimum_benchmark import PyTorchConfig +from optimum_benchmark.benchmark.config import BenchmarkConfig +from optimum_benchmark.launchers.process.config import ProcessConfig +from optimum_benchmark.scenarios.inference.config import InferenceConfig + + +class CPUPyTorchBenchmarkRunner(BenchmarkRunner): + def __init__(self): + super().__init__(backend="pytorch", hardware="cpu") + + def get_benchmark_config(self, 
model: str, attn_implementation: str, weights_config: str) -> BenchmarkConfig: + assert ( + weights_config in self.weights_configs + ), f"your config does not contain {weights_config}, adjust your _get_weights_configs to fix this issue" + + torch_dtype = self.weights_configs[weights_config]["torch_dtype"] + quant_scheme = self.weights_configs[weights_config]["quant_scheme"] + quant_config = self.weights_configs[weights_config]["quant_config"] + + launcher_config = ProcessConfig() + scenario_config = InferenceConfig( + memory=True, + energy=True, + latency=True, + duration=10, + iterations=10, + warmup_runs=10, + input_shapes=INPUT_SHAPES, + generate_kwargs=GENERATE_KWARGS, + ) + backend_config = PyTorchConfig( + model=model, + device="cpu", + no_weights=True, + library="transformers", + task="text-generation", + torch_dtype=torch_dtype, + quantization_scheme=quant_scheme, + quantization_config=quant_config, + attn_implementation=attn_implementation, + model_kwargs={"trust_remote_code": True}, + ) + + return BenchmarkConfig( + name=f"{weights_config}-{attn_implementation}", + scenario=scenario_config, + launcher=launcher_config, + backend=backend_config, + ) + + def _get_weights_configs(self, subset) -> Dict[str, Dict[str, Any]]: + if subset == "unquantized": + return { + "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, + "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, + "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, + } + else: + raise ValueError(f"Unknown subset: {subset}") + + def _get_attention_configs(self) -> List[str]: + return ["eager", "sdpa"] + + +if __name__ == "__main__": + runner = CPUPyTorchBenchmarkRunner() + runner.run_benchmarks() diff --git a/llm_perf/common/benchmark_runner.py b/llm_perf/common/benchmark_runner.py index 513e6c726..40673e126 100644 --- a/llm_perf/common/benchmark_runner.py +++ b/llm_perf/common/benchmark_runner.py @@ -48,9 +48,8 @@ def _get_weights_configs(self, subset: str) -> Dict[str, Dict[str, Any]]: def _get_attention_configs(self) -> List[str]: raise NotImplementedError("This method should be implemented in the child class") - @abstractmethod def is_benchmark_supported(self, weights_config: str, attn_implementation: str) -> bool: - raise NotImplementedError("This method should be implemented in the child class") + return True def run_benchmarks(self): os.environ["LOG_TO_FILE"] = "0" diff --git a/llm_perf/update_llm_perf_cpu_pytorch.py b/llm_perf/update_llm_perf_cpu_pytorch.py deleted file mode 100644 index def443cfe..000000000 --- a/llm_perf/update_llm_perf_cpu_pytorch.py +++ /dev/null @@ -1,147 +0,0 @@ -import os -import traceback -from itertools import product -from logging import getLogger - -from llm_perf.utils import ( - CANONICAL_PRETRAINED_OPEN_LLM_LIST, - GENERATE_KWARGS, - INPUT_SHAPES, - OPEN_LLM_LIST, - PRETRAINED_OPEN_LLM_LIST, - is_benchmark_conducted, -) -from optimum_benchmark import ( - Benchmark, - BenchmarkConfig, - BenchmarkReport, - InferenceConfig, - ProcessConfig, - PyTorchConfig, -) -from optimum_benchmark.logging_utils import setup_logging - -SUBSET = os.getenv("SUBSET", None) -MACHINE = os.getenv("MACHINE", None) -BACKEND = "pytorch" -HARDWARE = "cpu" - -if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-{HARDWARE}-debug" - CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] - SUBSET = "unquantized" -elif os.getenv("MACHINE", None) is not None and 
os.getenv("SUBSET", None) is not None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-{HARDWARE}-{SUBSET}-{MACHINE}" -else: - raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") - -ATTENTION_CONFIGS = ["eager", "sdpa"] - - -if SUBSET == "unquantized": - WEIGHTS_CONFIGS = { - # unquantized - "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, - "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, - "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, - } -else: - raise ValueError(f"Subset {SUBSET} not supported") - - -LOGGER = getLogger("llm-perf-backend") -LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") -LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") -LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") - - -def is_benchmark_supported(weights_config, attn_implementation, hardware): - if attn_implementation == "flash_attention_2": - return False - - return True - - -def benchmark_cpu_pytorch(model, attn_implementation, weights_config): - benchmark_name = f"{weights_config}-{attn_implementation}-{BACKEND}" - subfolder = f"{benchmark_name}/{model.replace('/', '--')}" - - torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] - quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] - quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] - - if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") - return - - if is_benchmark_conducted(PUSH_REPO_ID, subfolder): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") - return - - launcher_config = ProcessConfig() - scenario_config = InferenceConfig( - memory=True, - energy=True, - latency=True, - duration=10, - iterations=10, - warmup_runs=10, - input_shapes=INPUT_SHAPES, - generate_kwargs=GENERATE_KWARGS, - ) - - backend_config = PyTorchConfig( - model=model, - device="cpu", - no_weights=True, - library="transformers", - task="text-generation", - torch_dtype=torch_dtype, - quantization_scheme=quant_scheme, - quantization_config=quant_config, - attn_implementation=attn_implementation, - model_kwargs={"trust_remote_code": True}, - ) - - benchmark_config = BenchmarkConfig( - name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config - ) - - benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - try: - LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") - benchmark_report = Benchmark.launch(benchmark_config) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - except Exception: - LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") - benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - -if __name__ == "__main__": - # for isolated process - 
os.environ["LOG_TO_FILE"] = "0" - os.environ["LOG_LEVEL"] = "INFO" - - # for main process - setup_logging(level="INFO", prefix="MAIN-PROCESS") - - models_attentions_weights = list( - product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) - ) - - LOGGER.info( - f"Running a total of {len(models_attentions_weights)} benchmarks, " - f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " - f"{len(ATTENTION_CONFIGS)} attentions implementations " - f"and {len(WEIGHTS_CONFIGS)} weights configurations." - ) - - for model, attn_implementation, weights_config in models_attentions_weights: - benchmark_cpu_pytorch(model, attn_implementation, weights_config) diff --git a/llm_perf/update_llm_perf_cuda_pytorch.py b/llm_perf/update_llm_perf_cuda_pytorch.py deleted file mode 100644 index 3a216aca6..000000000 --- a/llm_perf/update_llm_perf_cuda_pytorch.py +++ /dev/null @@ -1,186 +0,0 @@ -import os -import traceback -from itertools import product -from logging import getLogger - -from llm_perf.utils import ( - CANONICAL_PRETRAINED_OPEN_LLM_LIST, - GENERATE_KWARGS, - INPUT_SHAPES, - OPEN_LLM_LIST, - PRETRAINED_OPEN_LLM_LIST, - is_benchmark_conducted, -) -from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport, InferenceConfig, ProcessConfig, PyTorchConfig -from optimum_benchmark.logging_utils import setup_logging - -SUBSET = os.getenv("SUBSET", None) -MACHINE = os.getenv("MACHINE", None) - -if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: - PUSH_REPO_ID = "optimum-benchmark/llm-perf-pytorch-cuda-debug" - CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] - SUBSET = "unquantized" -elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-pytorch-cuda-{SUBSET}-{MACHINE}" -else: - raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") - -ATTENTION_CONFIGS = ["eager", "sdpa", "flash_attention_2"] -if SUBSET == "unquantized": - WEIGHTS_CONFIGS = { - # unquantized - "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, - "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, - "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, - } -elif SUBSET == "bnb": - WEIGHTS_CONFIGS = { - # bnb - "4bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_4bit": True}}, - "8bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_8bit": True}}, - } -elif SUBSET == "gptq": - WEIGHTS_CONFIGS = { - # gptq - "4bit-gptq-exllama-v1": { - "quant_scheme": "gptq", - "torch_dtype": "float16", - "quant_config": {"bits": 4, "use_exllama ": True, "version": 1, "model_seqlen": 256}, - }, - "4bit-gptq-exllama-v2": { - "torch_dtype": "float16", - "quant_scheme": "gptq", - "quant_config": {"bits": 4, "use_exllama ": True, "version": 2, "model_seqlen": 256}, - }, - } -elif SUBSET == "awq": - WEIGHTS_CONFIGS = { - # awq - "4bit-awq-gemm": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": {"bits": 4, "version": "gemm"}, - }, - "4bit-awq-gemv": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": {"bits": 4, "version": "gemv"}, - }, - "4bit-awq-exllama-v1": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": { - "bits": 4, - "version": "exllama", - "exllama_config": {"version": 1, "max_input_len": 64, "max_batch_size": 1}, - }, - }, - 
"4bit-awq-exllama-v2": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": { - "bits": 4, - "version": "exllama", - "exllama_config": {"version": 2, "max_input_len": 64, "max_batch_size": 1}, - }, - }, - } - - -LOGGER = getLogger("llm-perf-backend") -LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") -LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") -LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") - - -def is_benchmark_supported(weights_config, attn_implementation): - if attn_implementation == "flash_attention_2" and weights_config == "float32": - return False - - return True - - -def benchmark_cuda_pytorch(model, attn_implementation, weights_config): - benchmark_name = f"{weights_config}-{attn_implementation}" - subfolder = f"{benchmark_name}/{model.replace('/', '--')}" - - torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] - quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] - quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] - - if not is_benchmark_supported(weights_config, attn_implementation): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") - return - - if is_benchmark_conducted(PUSH_REPO_ID, subfolder): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") - return - - launcher_config = ProcessConfig(device_isolation=True, device_isolation_action="kill") - scenario_config = InferenceConfig( - memory=True, - energy=True, - latency=True, - duration=10, - iterations=10, - warmup_runs=10, - input_shapes=INPUT_SHAPES, - generate_kwargs=GENERATE_KWARGS, - ) - backend_config = PyTorchConfig( - model=model, - device="cuda", - device_ids="0", - no_weights=True, - library="transformers", - task="text-generation", - torch_dtype=torch_dtype, - quantization_scheme=quant_scheme, - quantization_config=quant_config, - attn_implementation=attn_implementation, - model_kwargs={"trust_remote_code": True}, - ) - - benchmark_config = BenchmarkConfig( - name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config - ) - - benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - try: - LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") - benchmark_report = Benchmark.launch(benchmark_config) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - except Exception: - LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") - benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - -if __name__ == "__main__": - # for isolated process - os.environ["LOG_TO_FILE"] = "0" - os.environ["LOG_LEVEL"] = "INFO" - - # for main process - setup_logging(level="INFO", prefix="MAIN-PROCESS") - - models_attentions_weights = list( - product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) - ) - - LOGGER.info( - f"Running a total of {len(models_attentions_weights)} benchmarks, " - f"with 
{len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " - f"{len(ATTENTION_CONFIGS)} attentions implementations " - f"and {len(WEIGHTS_CONFIGS)} weights configurations." - ) - - for model, attn_implementation, weights_config in models_attentions_weights: - benchmark_cuda_pytorch(model, attn_implementation, weights_config) From 32626f97fd22993fdbb52bdfda8cc7553c7bc139 Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 12 Sep 2024 11:09:33 +0000 Subject: [PATCH 61/73] refractor pytorch cpu --- llm_perf/hardware.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/llm_perf/hardware.yml b/llm_perf/hardware.yml index 40f579189..1a351b674 100644 --- a/llm_perf/hardware.yml +++ b/llm_perf/hardware.yml @@ -25,5 +25,12 @@ - awq - bnb - gptq + backends: + - pytorch + +- machine: 32vCPU-C7i + hardware: cpu + subsets: + - unquantized backends: - pytorch \ No newline at end of file From 99a00dfd3eb342981380593eb6d0c409cd392b31 Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 12 Sep 2024 11:09:42 +0000 Subject: [PATCH 62/73] refractor pytorch cpu --- llm_perf/update_llm_perf_leaderboard.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index a99a0fc3a..619e54224 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -45,10 +45,10 @@ def update_perf_dfs(): for backend in hardware_config.backends: try: gather_benchmarks(subset, hardware_config.machine, backend, hardware_config.hardware) - except Exception as e: + except Exception: print( - f"benchmark for subset: {subset}, machine: {hardware_config.machine}, backend: {backend}, hardware: {hardware_config.hardware} not found" - ) + f"benchmark for subset: {subset}, machine: {hardware_config.machine}, backend: {backend}, hardware: {hardware_config.hardware} not found" + ) scrapping_script = """ From b27f80609089e4e90e6bde327ec48cf5324f5368 Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 12 Sep 2024 16:46:16 +0000 Subject: [PATCH 63/73] fix failling workflow --- .github/workflows/update_llm_perf_cpu_pytorch.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/update_llm_perf_cpu_pytorch.yaml b/.github/workflows/update_llm_perf_cpu_pytorch.yaml index 5a046a804..4fb972879 100644 --- a/.github/workflows/update_llm_perf_cpu_pytorch.yaml +++ b/.github/workflows/update_llm_perf_cpu_pytorch.yaml @@ -50,4 +50,4 @@ jobs: pip install packaging && pip install einops scipy optimum codecarbon pip install -U transformers huggingface_hub[hf_transfer] pip install -e . 
- python llm_perf/update_llm_perf_cpu_pytorch.py + python llm_perf/benchmark_runners/update_llm_perf_cpu_pytorch.py From 10c47eaf2f753c3664720b10c07eed9f84c736f9 Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 17 Sep 2024 11:03:15 +0000 Subject: [PATCH 64/73] fix broken canonical list --- llm_perf/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_perf/utils.py b/llm_perf/utils.py index 1f478913a..849f6adae 100644 --- a/llm_perf/utils.py +++ b/llm_perf/utils.py @@ -93,7 +93,7 @@ "google/gemma-2b", "google/gemma-7b", "google/recurrentgemma-2b", - "google/recurrentgemma-7b", + "google/recurrentgemma-9b", "internlm/internlm-20b", "internlm/internlm2-20b", "huggyllama/llama-7b", From 60aa33e257f53a74d9a00f3840161fc591e65c07 Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 17 Sep 2024 12:04:50 +0000 Subject: [PATCH 65/73] fix broken canonical list --- llm_perf/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_perf/utils.py b/llm_perf/utils.py index 849f6adae..6a5584284 100644 --- a/llm_perf/utils.py +++ b/llm_perf/utils.py @@ -109,7 +109,7 @@ "microsoft/rho-math-1b-v0.1", "mistralai/Mistral-7B-v0.1", "mistralai/Mixtral-8x7B-v0.1", - *"mistralai/Mixtral-8x22B-v0.1", + "mistralai/Mixtral-8x22B-v0.1", "openai-community/gpt2", "openai-community/gpt2-large", "stabilityai/stablelm-3b-4e1t", From f3bc069168614c35f04d6e4c61798c180699f0b5 Mon Sep 17 00:00:00 2001 From: baptiste Date: Fri, 20 Sep 2024 12:00:26 +0000 Subject: [PATCH 66/73] merge main --- optimum_benchmark/trackers/latency.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/optimum_benchmark/trackers/latency.py b/optimum_benchmark/trackers/latency.py index 1e0f1e95b..b6d5b0257 100644 --- a/optimum_benchmark/trackers/latency.py +++ b/optimum_benchmark/trackers/latency.py @@ -121,8 +121,9 @@ def __init__(self, device: str, backend: str): self.device = device self.backend = backend self.is_asynchronous = self.backend == "pytorch" and self.device == "cuda" - self.is_distributed = (self.backend != "vllm" and - is_torch_distributed_available() and torch.distributed.is_initialized()) + self.is_distributed = ( + self.backend != "vllm" and is_torch_distributed_available() and torch.distributed.is_initialized() + ) if self.is_asynchronous: LOGGER.info("\t+ Tracking latency using Pytorch CUDA events") From b2d5f1247cbd694901f7df97e9017691d56ba473 Mon Sep 17 00:00:00 2001 From: baptiste Date: Mon, 23 Sep 2024 06:56:12 +0000 Subject: [PATCH 67/73] merge main into branch --- llm_perf/update_llm_perf_cpu_pytorch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_perf/update_llm_perf_cpu_pytorch.py b/llm_perf/update_llm_perf_cpu_pytorch.py index def443cfe..39723b0fd 100644 --- a/llm_perf/update_llm_perf_cpu_pytorch.py +++ b/llm_perf/update_llm_perf_cpu_pytorch.py @@ -28,7 +28,7 @@ if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-{HARDWARE}-debug" - CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] + CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] # noqa: F811 SUBSET = "unquantized" elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-{HARDWARE}-{SUBSET}-{MACHINE}" From 08f70e2c5329a6035b59e8f893a3b345758a8156 Mon Sep 17 00:00:00 2001 From: baptiste Date: Mon, 23 Sep 2024 06:56:46 +0000 Subject: [PATCH 68/73] merge main into branch --- llm_perf/update_llm_perf_cpu_pytorch.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_perf/update_llm_perf_cpu_pytorch.py b/llm_perf/update_llm_perf_cpu_pytorch.py index 39723b0fd..250355505 100644 --- a/llm_perf/update_llm_perf_cpu_pytorch.py +++ b/llm_perf/update_llm_perf_cpu_pytorch.py @@ -28,7 +28,7 @@ if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-{HARDWARE}-debug" - CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] # noqa: F811 + CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] # noqa: F811 SUBSET = "unquantized" elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-{HARDWARE}-{SUBSET}-{MACHINE}" From ab1710a27cfcba43da3337096873262c600e90f3 Mon Sep 17 00:00:00 2001 From: baptiste Date: Mon, 23 Sep 2024 06:58:09 +0000 Subject: [PATCH 69/73] merge main into branch --- llm_perf/update_llm_perf_cpu_pytorch.py | 147 ------------------ llm_perf/update_llm_perf_cuda_pytorch.py | 186 ----------------------- 2 files changed, 333 deletions(-) delete mode 100644 llm_perf/update_llm_perf_cpu_pytorch.py delete mode 100644 llm_perf/update_llm_perf_cuda_pytorch.py diff --git a/llm_perf/update_llm_perf_cpu_pytorch.py b/llm_perf/update_llm_perf_cpu_pytorch.py deleted file mode 100644 index 250355505..000000000 --- a/llm_perf/update_llm_perf_cpu_pytorch.py +++ /dev/null @@ -1,147 +0,0 @@ -import os -import traceback -from itertools import product -from logging import getLogger - -from llm_perf.utils import ( - CANONICAL_PRETRAINED_OPEN_LLM_LIST, - GENERATE_KWARGS, - INPUT_SHAPES, - OPEN_LLM_LIST, - PRETRAINED_OPEN_LLM_LIST, - is_benchmark_conducted, -) -from optimum_benchmark import ( - Benchmark, - BenchmarkConfig, - BenchmarkReport, - InferenceConfig, - ProcessConfig, - PyTorchConfig, -) -from optimum_benchmark.logging_utils import setup_logging - -SUBSET = os.getenv("SUBSET", None) -MACHINE = os.getenv("MACHINE", None) -BACKEND = "pytorch" -HARDWARE = "cpu" - -if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-{HARDWARE}-debug" - CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] # noqa: F811 - SUBSET = "unquantized" -elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-{HARDWARE}-{SUBSET}-{MACHINE}" -else: - raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") - -ATTENTION_CONFIGS = ["eager", "sdpa"] - - -if SUBSET == "unquantized": - WEIGHTS_CONFIGS = { - # unquantized - "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, - "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, - "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, - } -else: - raise ValueError(f"Subset {SUBSET} not supported") - - -LOGGER = getLogger("llm-perf-backend") -LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") -LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") -LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") - - -def is_benchmark_supported(weights_config, attn_implementation, hardware): - if attn_implementation == "flash_attention_2": - return False - - return True - - -def benchmark_cpu_pytorch(model, attn_implementation, weights_config): - benchmark_name = 
f"{weights_config}-{attn_implementation}-{BACKEND}" - subfolder = f"{benchmark_name}/{model.replace('/', '--')}" - - torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] - quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] - quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] - - if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") - return - - if is_benchmark_conducted(PUSH_REPO_ID, subfolder): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") - return - - launcher_config = ProcessConfig() - scenario_config = InferenceConfig( - memory=True, - energy=True, - latency=True, - duration=10, - iterations=10, - warmup_runs=10, - input_shapes=INPUT_SHAPES, - generate_kwargs=GENERATE_KWARGS, - ) - - backend_config = PyTorchConfig( - model=model, - device="cpu", - no_weights=True, - library="transformers", - task="text-generation", - torch_dtype=torch_dtype, - quantization_scheme=quant_scheme, - quantization_config=quant_config, - attn_implementation=attn_implementation, - model_kwargs={"trust_remote_code": True}, - ) - - benchmark_config = BenchmarkConfig( - name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config - ) - - benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - try: - LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") - benchmark_report = Benchmark.launch(benchmark_config) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - except Exception: - LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") - benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - -if __name__ == "__main__": - # for isolated process - os.environ["LOG_TO_FILE"] = "0" - os.environ["LOG_LEVEL"] = "INFO" - - # for main process - setup_logging(level="INFO", prefix="MAIN-PROCESS") - - models_attentions_weights = list( - product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) - ) - - LOGGER.info( - f"Running a total of {len(models_attentions_weights)} benchmarks, " - f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " - f"{len(ATTENTION_CONFIGS)} attentions implementations " - f"and {len(WEIGHTS_CONFIGS)} weights configurations." 
- ) - - for model, attn_implementation, weights_config in models_attentions_weights: - benchmark_cpu_pytorch(model, attn_implementation, weights_config) diff --git a/llm_perf/update_llm_perf_cuda_pytorch.py b/llm_perf/update_llm_perf_cuda_pytorch.py deleted file mode 100644 index 98914f6ad..000000000 --- a/llm_perf/update_llm_perf_cuda_pytorch.py +++ /dev/null @@ -1,186 +0,0 @@ -import os -import traceback -from itertools import product -from logging import getLogger - -from llm_perf.utils import ( - CANONICAL_PRETRAINED_OPEN_LLM_LIST, - GENERATE_KWARGS, - INPUT_SHAPES, - OPEN_LLM_LIST, - PRETRAINED_OPEN_LLM_LIST, - is_benchmark_conducted, -) -from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport, InferenceConfig, ProcessConfig, PyTorchConfig -from optimum_benchmark.logging_utils import setup_logging - -SUBSET = os.getenv("SUBSET", None) -MACHINE = os.getenv("MACHINE", None) - -if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: - PUSH_REPO_ID = "optimum-benchmark/llm-perf-pytorch-cuda-debug" - CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] # noqa: F811 - SUBSET = "unquantized" -elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-pytorch-cuda-{SUBSET}-{MACHINE}" -else: - raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") - -ATTENTION_CONFIGS = ["eager", "sdpa", "flash_attention_2"] -if SUBSET == "unquantized": - WEIGHTS_CONFIGS = { - # unquantized - "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, - "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, - "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, - } -elif SUBSET == "bnb": - WEIGHTS_CONFIGS = { - # bnb - "4bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_4bit": True}}, - "8bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_8bit": True}}, - } -elif SUBSET == "gptq": - WEIGHTS_CONFIGS = { - # gptq - "4bit-gptq-exllama-v1": { - "quant_scheme": "gptq", - "torch_dtype": "float16", - "quant_config": {"bits": 4, "use_exllama ": True, "version": 1, "model_seqlen": 256}, - }, - "4bit-gptq-exllama-v2": { - "torch_dtype": "float16", - "quant_scheme": "gptq", - "quant_config": {"bits": 4, "use_exllama ": True, "version": 2, "model_seqlen": 256}, - }, - } -elif SUBSET == "awq": - WEIGHTS_CONFIGS = { - # awq - "4bit-awq-gemm": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": {"bits": 4, "version": "gemm"}, - }, - "4bit-awq-gemv": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": {"bits": 4, "version": "gemv"}, - }, - "4bit-awq-exllama-v1": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": { - "bits": 4, - "version": "exllama", - "exllama_config": {"version": 1, "max_input_len": 64, "max_batch_size": 1}, - }, - }, - "4bit-awq-exllama-v2": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": { - "bits": 4, - "version": "exllama", - "exllama_config": {"version": 2, "max_input_len": 64, "max_batch_size": 1}, - }, - }, - } - - -LOGGER = getLogger("llm-perf-backend") -LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") -LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") -LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") - - -def 
is_benchmark_supported(weights_config, attn_implementation): - if attn_implementation == "flash_attention_2" and weights_config == "float32": - return False - - return True - - -def benchmark_cuda_pytorch(model, attn_implementation, weights_config): - benchmark_name = f"{weights_config}-{attn_implementation}" - subfolder = f"{benchmark_name}/{model.replace('/', '--')}" - - torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] - quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] - quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] - - if not is_benchmark_supported(weights_config, attn_implementation): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") - return - - if is_benchmark_conducted(PUSH_REPO_ID, subfolder): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") - return - - launcher_config = ProcessConfig(device_isolation=True, device_isolation_action="kill") - scenario_config = InferenceConfig( - memory=True, - energy=True, - latency=True, - duration=10, - iterations=10, - warmup_runs=10, - input_shapes=INPUT_SHAPES, - generate_kwargs=GENERATE_KWARGS, - ) - backend_config = PyTorchConfig( - model=model, - device="cuda", - device_ids="0", - no_weights=True, - library="transformers", - task="text-generation", - torch_dtype=torch_dtype, - quantization_scheme=quant_scheme, - quantization_config=quant_config, - attn_implementation=attn_implementation, - model_kwargs={"trust_remote_code": True}, - ) - - benchmark_config = BenchmarkConfig( - name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config - ) - - benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - try: - LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") - benchmark_report = Benchmark.launch(benchmark_config) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - except Exception: - LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") - benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - -if __name__ == "__main__": - # for isolated process - os.environ["LOG_TO_FILE"] = "0" - os.environ["LOG_LEVEL"] = "INFO" - - # for main process - setup_logging(level="INFO", prefix="MAIN-PROCESS") - - models_attentions_weights = list( - product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) - ) - - LOGGER.info( - f"Running a total of {len(models_attentions_weights)} benchmarks, " - f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " - f"{len(ATTENTION_CONFIGS)} attentions implementations " - f"and {len(WEIGHTS_CONFIGS)} weights configurations." 
- ) - - for model, attn_implementation, weights_config in models_attentions_weights: - benchmark_cuda_pytorch(model, attn_implementation, weights_config) From 25128277e1c6a2cdbfa82d2b0bd0efd1f5b3cc72 Mon Sep 17 00:00:00 2001 From: baptiste Date: Mon, 23 Sep 2024 07:08:25 +0000 Subject: [PATCH 70/73] add new label system --- .../update_llm_perf_cpu_pytorch.yaml | 16 ++++++++++++++++ .../update_llm_perf_cuda_pytorch.yaml | 5 +++++ .../update_llm_perf_leaderboard.yaml | 19 ++++++++++++++++++- 3 files changed, 39 insertions(+), 1 deletion(-) diff --git a/.github/workflows/update_llm_perf_cpu_pytorch.yaml b/.github/workflows/update_llm_perf_cpu_pytorch.yaml index 9e002d70b..e24a1aa69 100644 --- a/.github/workflows/update_llm_perf_cpu_pytorch.yaml +++ b/.github/workflows/update_llm_perf_cpu_pytorch.yaml @@ -4,6 +4,18 @@ on: workflow_dispatch: schedule: - cron: "0 0 * * *" + push: + branches: + - main + pull_request: + branches: + - main + types: + - opened + - reopened + - synchronize + - labeled + - unlabeled concurrency: cancel-in-progress: true @@ -14,6 +26,10 @@ env: jobs: run_benchmarks: + if: ${{ + (github.event_name == 'push') || + (github.event_name == 'workflow_dispatch') || + contains( github.event.pull_request.labels.*.name, 'leaderboard')}} strategy: fail-fast: false matrix: diff --git a/.github/workflows/update_llm_perf_cuda_pytorch.yaml b/.github/workflows/update_llm_perf_cuda_pytorch.yaml index ab36c3b9c..9a60ab931 100644 --- a/.github/workflows/update_llm_perf_cuda_pytorch.yaml +++ b/.github/workflows/update_llm_perf_cuda_pytorch.yaml @@ -14,6 +14,11 @@ env: jobs: run_benchmarks: + if: ${{ + (github.event_name == 'push') || + (github.event_name == 'workflow_dispatch') || + contains( github.event.pull_request.labels.*.name, 'leaderboard')}} + strategy: fail-fast: false matrix: diff --git a/.github/workflows/update_llm_perf_leaderboard.yaml b/.github/workflows/update_llm_perf_leaderboard.yaml index 10ed80c98..f0a6b7a43 100644 --- a/.github/workflows/update_llm_perf_leaderboard.yaml +++ b/.github/workflows/update_llm_perf_leaderboard.yaml @@ -4,13 +4,30 @@ on: workflow_dispatch: schedule: - cron: "0 */6 * * *" + push: + branches: + - main + pull_request: + branches: + - main + types: + - opened + - reopened + - synchronize + - labeled + - unlabeled concurrency: cancel-in-progress: true - group: ${{ github.workflow }}-${{ github.ref }} + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} jobs: update_llm_perf_leaderboard: + if: ${{ + (github.event_name == 'push') || + (github.event_name == 'workflow_dispatch') || + contains( github.event.pull_request.labels.*.name, 'leaderboard')}} + runs-on: ubuntu-latest steps: - name: Checkout From defc78abaa3819db117a3d256922db62260a4eda Mon Sep 17 00:00:00 2001 From: baptiste Date: Mon, 23 Sep 2024 07:09:51 +0000 Subject: [PATCH 71/73] add new label system --- .github/workflows/update_llm_perf_cpu_pytorch.yaml | 2 +- .../workflows/update_llm_perf_cuda_pytorch.yaml | 14 +++++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/.github/workflows/update_llm_perf_cpu_pytorch.yaml b/.github/workflows/update_llm_perf_cpu_pytorch.yaml index e24a1aa69..eadacc26a 100644 --- a/.github/workflows/update_llm_perf_cpu_pytorch.yaml +++ b/.github/workflows/update_llm_perf_cpu_pytorch.yaml @@ -19,7 +19,7 @@ on: concurrency: cancel-in-progress: true - group: ${{ github.workflow }}-${{ github.ref }} + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} env: IMAGE: 
ghcr.io/huggingface/optimum-benchmark:latest-cpu diff --git a/.github/workflows/update_llm_perf_cuda_pytorch.yaml b/.github/workflows/update_llm_perf_cuda_pytorch.yaml index 9a60ab931..01aed81e7 100644 --- a/.github/workflows/update_llm_perf_cuda_pytorch.yaml +++ b/.github/workflows/update_llm_perf_cuda_pytorch.yaml @@ -4,10 +4,22 @@ on: workflow_dispatch: schedule: - cron: "0 0 * * *" + push: + branches: + - main + pull_request: + branches: + - main + types: + - opened + - reopened + - synchronize + - labeled + - unlabeled concurrency: cancel-in-progress: true - group: ${{ github.workflow }}-${{ github.ref }} + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} env: IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cuda From 89b6a97a171738c6d66996e253e7990946dfb3ba Mon Sep 17 00:00:00 2001 From: baptiste Date: Mon, 23 Sep 2024 09:34:59 +0000 Subject: [PATCH 72/73] add new chnages from review --- .../update_llm_perf_cpu_pytorch.py | 32 +++++++++-- .../update_llm_perf_cuda_pytorch.py | 35 +++++++++--- llm_perf/common/benchmark_runner.py | 57 ++++++++++--------- 3 files changed, 85 insertions(+), 39 deletions(-) diff --git a/llm_perf/benchmark_runners/update_llm_perf_cpu_pytorch.py b/llm_perf/benchmark_runners/update_llm_perf_cpu_pytorch.py index 62a52a67b..c6ce76290 100644 --- a/llm_perf/benchmark_runners/update_llm_perf_cpu_pytorch.py +++ b/llm_perf/benchmark_runners/update_llm_perf_cpu_pytorch.py @@ -1,18 +1,39 @@ +from itertools import product from typing import Any, Dict, List -from llm_perf.common.benchmark_runner import BenchmarkRunner -from llm_perf.common.utils import GENERATE_KWARGS, INPUT_SHAPES +from llm_perf.common.benchmark_runner import LLMPerfBenchmarkManager +from llm_perf.common.utils import CANONICAL_PRETRAINED_OPEN_LLM_LIST, GENERATE_KWARGS, INPUT_SHAPES from optimum_benchmark import PyTorchConfig from optimum_benchmark.benchmark.config import BenchmarkConfig from optimum_benchmark.launchers.process.config import ProcessConfig from optimum_benchmark.scenarios.inference.config import InferenceConfig -class CPUPyTorchBenchmarkRunner(BenchmarkRunner): +class CPUPyTorchBenchmarkRunner(LLMPerfBenchmarkManager): def __init__(self): - super().__init__(backend="pytorch", hardware="cpu") + super().__init__(backend="pytorch", device="cpu") + + self.attention_configs = self._get_attention_configs() + assert self.subset is not None, "SUBSET environment variable must be set for benchmarking" + self.weights_configs = self._get_weights_configs(self.subset) + + def get_list_of_benchmarks_to_run(self) -> List[Dict[str, Any]]: + return [ + {"model": model, "attn_implementation": attn_impl, "weights_config": weights_cfg} + for model, attn_impl, weights_cfg in product( + CANONICAL_PRETRAINED_OPEN_LLM_LIST, self.attention_configs, self.weights_configs.keys() + ) + ] + + def get_benchmark_name(self, model: str, **kwargs) -> str: + weights_config = kwargs["weights_config"] + attn_implementation = kwargs["attn_implementation"] + return f"{model}-{weights_config}-{attn_implementation}" + + def get_benchmark_config(self, model: str, **kwargs) -> BenchmarkConfig: + weights_config = kwargs["weights_config"] + attn_implementation = kwargs["attn_implementation"] - def get_benchmark_config(self, model: str, attn_implementation: str, weights_config: str) -> BenchmarkConfig: assert ( weights_config in self.weights_configs ), f"your config does not contain {weights_config}, adjust your _get_weights_configs to fix this issue" @@ -65,7 +86,6 @@ def 
_get_weights_configs(self, subset) -> Dict[str, Dict[str, Any]]: def _get_attention_configs(self) -> List[str]: return ["eager", "sdpa"] - if __name__ == "__main__": runner = CPUPyTorchBenchmarkRunner() runner.run_benchmarks() diff --git a/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py b/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py index f22b28eb0..82aab3db9 100644 --- a/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py +++ b/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py @@ -1,23 +1,44 @@ +from itertools import product from typing import Any, Dict, List -from llm_perf.common.benchmark_runner import BenchmarkRunner -from llm_perf.common.utils import GENERATE_KWARGS, INPUT_SHAPES +from llm_perf.common.benchmark_runner import LLMPerfBenchmarkManager +from llm_perf.common.utils import CANONICAL_PRETRAINED_OPEN_LLM_LIST, GENERATE_KWARGS, INPUT_SHAPES from optimum_benchmark import PyTorchConfig from optimum_benchmark.benchmark.config import BenchmarkConfig from optimum_benchmark.launchers.process.config import ProcessConfig from optimum_benchmark.scenarios.inference.config import InferenceConfig -class CUDAPyTorchBenchmarkRunner(BenchmarkRunner): +class CUDAPyTorchBenchmarkRunner(LLMPerfBenchmarkManager): def __init__(self): - super().__init__(backend="pytorch", hardware="cuda") + super().__init__(backend="pytorch", device="cuda") - def is_benchmark_supported(self, weights_config: str, attn_implementation: str) -> bool: - if attn_implementation == "flash_attention_2" and weights_config == "float32": + self.attention_configs = self._get_attention_configs() + assert self.subset is not None, "SUBSET environment variable must be set for benchmarking" + self.weights_configs = self._get_weights_configs(self.subset) + + def get_list_of_benchmarks_to_run(self) -> List[Dict[str, Any]]: + return [ + {"model": model, "attn_implementation": attn_impl, "weights_config": weights_cfg} + for model, attn_impl, weights_cfg in product( + CANONICAL_PRETRAINED_OPEN_LLM_LIST, self.attention_configs, self.weights_configs.keys() + ) + ] + + def get_benchmark_name(self, model: str, **kwargs) -> str: + weights_config = kwargs["weights_config"] + attn_implementation = kwargs["attn_implementation"] + return f"{model}-{weights_config}-{attn_implementation}" + + def is_benchmark_supported(self, **kwargs) -> bool: + if kwargs["attn_implementation"] == "flash_attention_2" and kwargs["weights_config"] == "float32": return False return True - def get_benchmark_config(self, model: str, attn_implementation: str, weights_config: str) -> BenchmarkConfig: + def get_benchmark_config(self, model: str, **kwargs) -> BenchmarkConfig: + weights_config = kwargs["weights_config"] + attn_implementation = kwargs["attn_implementation"] + assert ( weights_config in self.weights_configs ), f"your config does contains the {weights_config}, adjust your _get_weights_configs to fix this issue" diff --git a/llm_perf/common/benchmark_runner.py b/llm_perf/common/benchmark_runner.py index 40673e126..a889ade4b 100644 --- a/llm_perf/common/benchmark_runner.py +++ b/llm_perf/common/benchmark_runner.py @@ -1,7 +1,6 @@ import os import traceback from abc import ABC, abstractmethod -from itertools import product from logging import getLogger from typing import Any, Dict, List, Optional @@ -14,28 +13,23 @@ from optimum_benchmark.logging_utils import setup_logging -class BenchmarkRunner(ABC): - def __init__(self, backend: str, hardware: str, subset: Optional[str] = None, machine: Optional[str] = None): +class 
LLMPerfBenchmarkManager(ABC): + def __init__(self, backend: str, device: str, subset: Optional[str] = None, machine: Optional[str] = None): self.backend = backend - self.hardware = hardware + self.device = device self.subset = subset or os.getenv("SUBSET", None) self.machine = machine or os.getenv("MACHINE", None) self.logger = getLogger("llm-perf-backend") if self.machine is None and self.subset is None: - self.push_repo_id = f"optimum-benchmark/llm-perf-{self.backend}-{self.hardware}-debug" + self.push_repo_id = f"optimum-benchmark/llm-perf-{self.backend}-{self.device}-debug" self.canonical_pretrained_open_llm_list = ["gpt2"] self.subset = "unquantized" elif self.machine is not None and self.subset is not None: - self.push_repo_id = ( - f"optimum-benchmark/llm-perf-{self.backend}-{self.hardware}-{self.subset}-{self.machine}" - ) + self.push_repo_id = f"optimum-benchmark/llm-perf-{self.backend}-{self.device}-{self.subset}-{self.machine}" else: raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") - self.attention_configs = self._get_attention_configs() - self.weights_configs = self._get_weights_configs(self.subset) - self.logger.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") self.logger.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") self.logger.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") @@ -48,27 +42,32 @@ def _get_weights_configs(self, subset: str) -> Dict[str, Dict[str, Any]]: def _get_attention_configs(self) -> List[str]: raise NotImplementedError("This method should be implemented in the child class") - def is_benchmark_supported(self, weights_config: str, attn_implementation: str) -> bool: + def is_benchmark_supported(self, **kwargs) -> bool: + """ + Can be overridden by child classes to exclude unsupported configurations + """ return True + @abstractmethod + def get_list_of_benchmarks_to_run(self) -> List[Dict[str, Any]]: + raise NotImplementedError("This method should be implemented in the child class") + def run_benchmarks(self): os.environ["LOG_TO_FILE"] = "0" os.environ["LOG_LEVEL"] = "INFO" setup_logging(level="INFO", prefix="MAIN-PROCESS") - models_attentions_weights = list( - product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, self.attention_configs, self.weights_configs.keys()) - ) + benchmarks_to_run = self.get_list_of_benchmarks_to_run() self.logger.info( - f"Running a total of {len(models_attentions_weights)} benchmarks, " - f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " - f"{len(self.attention_configs)} attentions implementations " - f"and {len(self.weights_configs)} weights configurations." 
+ f"Running a total of {len(benchmarks_to_run)} benchmarks, " + f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models" ) - for model, attn_implementation, weights_config in models_attentions_weights: - self.run_benchmark(model, attn_implementation, weights_config) + for benchmark_name in benchmarks_to_run: + assert "model" in benchmark_name, "each benchmark should have a model" + + self.run_benchmark(**benchmark_name) def is_benchmark_conducted(self, push_repo_id, subfolder): try: @@ -80,11 +79,17 @@ def is_benchmark_conducted(self, push_repo_id, subfolder): except Exception: return False - def run_benchmark(self, model: str, attn_implementation: str, weights_config: str): - benchmark_name = f"{weights_config}-{attn_implementation}" + @abstractmethod + def get_benchmark_name(self, model: str, **kwargs) -> str: + raise NotImplementedError("This method should be implemented in the child class") + + def run_benchmark(self, **kwargs): + model = kwargs["model"] + + benchmark_name = self.get_benchmark_name(model, **kwargs) subfolder = f"{benchmark_name}/{model.replace('/', '--')}" - if not self.is_benchmark_supported(weights_config, attn_implementation): + if not self.is_benchmark_supported(**kwargs): self.logger.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") return @@ -92,12 +97,12 @@ def run_benchmark(self, model: str, attn_implementation: str, weights_config: st self.logger.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") return - benchmark_config = self.get_benchmark_config(model, attn_implementation, weights_config) + benchmark_config = self.get_benchmark_config(model, **kwargs) benchmark_config.push_to_hub(repo_id=self.push_repo_id, subfolder=subfolder, private=True) self.execute_and_log_benchmark(benchmark_config, subfolder) @abstractmethod - def get_benchmark_config(self, model: str, attn_implementation: str, weights_config: str) -> BenchmarkConfig: + def get_benchmark_config(self, model: str, **kwargs) -> BenchmarkConfig: raise NotImplementedError("This method should be implemented in the child class") def execute_and_log_benchmark(self, benchmark_config: BenchmarkConfig, subfolder: str): From 3130c87c8f664ee5109fb8a6d327d643e2fe62d8 Mon Sep 17 00:00:00 2001 From: baptiste Date: Mon, 23 Sep 2024 09:41:45 +0000 Subject: [PATCH 73/73] add new chnages from review --- llm_perf/benchmark_runners/update_llm_perf_cpu_pytorch.py | 1 + llm_perf/common/benchmark_runner.py | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/llm_perf/benchmark_runners/update_llm_perf_cpu_pytorch.py b/llm_perf/benchmark_runners/update_llm_perf_cpu_pytorch.py index c6ce76290..c27f5e220 100644 --- a/llm_perf/benchmark_runners/update_llm_perf_cpu_pytorch.py +++ b/llm_perf/benchmark_runners/update_llm_perf_cpu_pytorch.py @@ -86,6 +86,7 @@ def _get_weights_configs(self, subset) -> Dict[str, Dict[str, Any]]: def _get_attention_configs(self) -> List[str]: return ["eager", "sdpa"] + if __name__ == "__main__": runner = CPUPyTorchBenchmarkRunner() runner.run_benchmarks() diff --git a/llm_perf/common/benchmark_runner.py b/llm_perf/common/benchmark_runner.py index a889ade4b..def30dc20 100644 --- a/llm_perf/common/benchmark_runner.py +++ b/llm_perf/common/benchmark_runner.py @@ -84,7 +84,7 @@ def get_benchmark_name(self, model: str, **kwargs) -> str: raise NotImplementedError("This method should be implemented in the child class") def run_benchmark(self, **kwargs): - model = kwargs["model"] + model = 
kwargs.pop("model") benchmark_name = self.get_benchmark_name(model, **kwargs) subfolder = f"{benchmark_name}/{model.replace('/', '--')}" @@ -112,8 +112,10 @@ def execute_and_log_benchmark(self, benchmark_config: BenchmarkConfig, subfolder benchmark_report.push_to_hub(repo_id=self.push_repo_id, subfolder=subfolder, private=True) benchmark = Benchmark(config=benchmark_config, report=benchmark_report) benchmark.push_to_hub(repo_id=self.push_repo_id, subfolder=subfolder, private=True) - except Exception: - self.logger.error(f"Benchmark {benchmark_config.name} failed with model {benchmark_config.backend.model}") + except Exception as e: + self.logger.error( + f"Benchmark {benchmark_config.name} failed with model {benchmark_config.backend.model}, error:\n{e}" + ) benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) benchmark_report.push_to_hub(repo_id=self.push_repo_id, subfolder=subfolder, private=True) benchmark = Benchmark(config=benchmark_config, report=benchmark_report)