From 92a5cea4dd0c750c1de59504f213068dc3bba59c Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 21 Aug 2024 13:17:29 +0000 Subject: [PATCH 01/73] add intel pytorch ort and openvino to leaderboard --- .gitignore | 5 + .../cuda}/update_llm_perf_cuda_pytorch.py | 9 +- .../intel/update_llm_perf_intel_openvino.py | 179 +++++++++++++++++ .../intel/update_llm_perf_intel_ort.py | 182 ++++++++++++++++++ .../intel/update_llm_perf_intel_pytorch.py | 182 ++++++++++++++++++ llm_perf/utils.py | 10 +- 6 files changed, 560 insertions(+), 7 deletions(-) rename llm_perf/{ => hardware/cuda}/update_llm_perf_cuda_pytorch.py (97%) create mode 100644 llm_perf/hardware/intel/update_llm_perf_intel_openvino.py create mode 100644 llm_perf/hardware/intel/update_llm_perf_intel_ort.py create mode 100644 llm_perf/hardware/intel/update_llm_perf_intel_pytorch.py diff --git a/.gitignore b/.gitignore index f26fda31e..31f9b57f0 100644 --- a/.gitignore +++ b/.gitignore @@ -173,6 +173,11 @@ experiments/ amdsmi/ amd-* +# Code carbon +generate_codecarbon.json +task_codecarbon.json +prefill_codecarbon.json + # Mac specific .DS_Store outputs/ \ No newline at end of file diff --git a/llm_perf/update_llm_perf_cuda_pytorch.py b/llm_perf/hardware/cuda/update_llm_perf_cuda_pytorch.py similarity index 97% rename from llm_perf/update_llm_perf_cuda_pytorch.py rename to llm_perf/hardware/cuda/update_llm_perf_cuda_pytorch.py index 51ab678f3..8b65e1f5c 100644 --- a/llm_perf/update_llm_perf_cuda_pytorch.py +++ b/llm_perf/hardware/cuda/update_llm_perf_cuda_pytorch.py @@ -17,6 +17,7 @@ SUBSET = os.getenv("SUBSET", None) MACHINE = os.getenv("MACHINE", None) +HARDWARE = "cuda" if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: @@ -28,7 +29,7 @@ else: raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") -ATTENTION_COFIGS = ["eager", "sdpa", "flash_attention_2"] +ATTENTION_CONFIGS = ["eager", "sdpa", "flash_attention_2"] if SUBSET == "unquantized": WEIGHTS_CONFIGS = { # unquantized @@ -104,7 +105,7 @@ def benchmark_cuda_pytorch(model, attn_implementation, weights_config): quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] - if not is_benchmark_supported(weights_config, attn_implementation): + if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") return @@ -167,13 +168,13 @@ def benchmark_cuda_pytorch(model, attn_implementation, weights_config): setup_logging(level="INFO", prefix="MAIN-PROCESS") models_attentions_weights = list( - product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_COFIGS, WEIGHTS_CONFIGS.keys()) + product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) ) LOGGER.info( f"Running a total of {len(models_attentions_weights)} benchmarks, " f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " - f"{len(ATTENTION_COFIGS)} attentions implementations " + f"{len(ATTENTION_CONFIGS)} attentions implementations " f"and {len(WEIGHTS_CONFIGS)} weights configurations." 
) diff --git a/llm_perf/hardware/intel/update_llm_perf_intel_openvino.py b/llm_perf/hardware/intel/update_llm_perf_intel_openvino.py new file mode 100644 index 000000000..1aa6052a9 --- /dev/null +++ b/llm_perf/hardware/intel/update_llm_perf_intel_openvino.py @@ -0,0 +1,179 @@ +import os +import traceback +from itertools import product +from logging import getLogger + +from llm_perf.utils import ( + CANONICAL_PRETRAINED_OPEN_LLM_LIST, + GENERATE_KWARGS, + INPUT_SHAPES, + OPEN_LLM_LIST, + PRETRAINED_OPEN_LLM_LIST, + is_benchmark_conducted, + is_benchmark_supported, +) +from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport, InferenceConfig, ProcessConfig, OVConfig +from optimum_benchmark.logging_utils import setup_logging + +SUBSET = os.getenv("SUBSET", None) +MACHINE = os.getenv("MACHINE", None) +HARDWARE = "intel" + + +if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: + PUSH_REPO_ID = "optimum-benchmark/llm-perf-openvino-intel-debug" + CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] + SUBSET = "unquantized" +elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: + PUSH_REPO_ID = f"optimum-benchmark/llm-perf-openvino-intel-{SUBSET}-{MACHINE}" +else: + raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") + +ATTENTION_CONFIGS = ["eager", "sdpa"] +if SUBSET == "unquantized": + WEIGHTS_CONFIGS = { + # unquantized + "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, + "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, + "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, + } +elif SUBSET == "bnb": + WEIGHTS_CONFIGS = { + # bnb + "4bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_4bit": True}}, + "8bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_8bit": True}}, + } +elif SUBSET == "gptq": + WEIGHTS_CONFIGS = { + # gptq + "4bit-gptq-exllama-v1": { + "quant_scheme": "gptq", + "torch_dtype": "float16", + "quant_config": {"bits": 4, "use_exllama ": True, "version": 1, "model_seqlen": 256}, + }, + "4bit-gptq-exllama-v2": { + "torch_dtype": "float16", + "quant_scheme": "gptq", + "quant_config": {"bits": 4, "use_exllama ": True, "version": 2, "model_seqlen": 256}, + }, + } +elif SUBSET == "awq": + WEIGHTS_CONFIGS = { + # awq + "4bit-awq-gemm": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": {"bits": 4, "version": "gemm"}, + }, + "4bit-awq-gemv": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": {"bits": 4, "version": "gemv"}, + }, + "4bit-awq-exllama-v1": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": { + "bits": 4, + "version": "exllama", + "exllama_config": {"version": 1, "max_input_len": 64, "max_batch_size": 1}, + }, + }, + "4bit-awq-exllama-v2": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": { + "bits": 4, + "version": "exllama", + "exllama_config": {"version": 2, "max_input_len": 64, "max_batch_size": 1}, + }, + }, + } + + +LOGGER = getLogger("llm-perf-backend") +LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") +LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") +LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") + + +def benchmark_intel_openvino(model, attn_implementation, weights_config): + benchmark_name = 
f"{weights_config}-{attn_implementation}" + subfolder = f"{benchmark_name}/{model.replace('/', '--')}" + + torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] + quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] + quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] + + if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): + LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") + return + + if is_benchmark_conducted(PUSH_REPO_ID, subfolder): + LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") + return + + launcher_config = ProcessConfig() + scenario_config = InferenceConfig( + memory=True, + energy=True, + latency=True, + duration=10, + iterations=10, + warmup_runs=10, + input_shapes=INPUT_SHAPES, + generate_kwargs=GENERATE_KWARGS, + ) + backend_config = OVConfig( + model=model, + device="cpu", + device_ids="0", + no_weights=True, + library="transformers", + task="text-generation", + quantization_config=quant_config, + model_kwargs={"trust_remote_code": True}, + ) + + benchmark_config = BenchmarkConfig( + name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config + ) + + benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + try: + LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") + benchmark_report = Benchmark.launch(benchmark_config) + benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + except Exception: + LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") + benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) + benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + +if __name__ == "__main__": + # for isolated process + os.environ["LOG_TO_FILE"] = "0" + os.environ["LOG_LEVEL"] = "INFO" + + # for main process + setup_logging(level="INFO", prefix="MAIN-PROCESS") + + models_attentions_weights = list( + product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) + ) + + LOGGER.info( + f"Running a total of {len(models_attentions_weights)} benchmarks, " + f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " + f"{len(ATTENTION_CONFIGS)} attentions implementations " + f"and {len(WEIGHTS_CONFIGS)} weights configurations." 
+ ) + + for model, attn_implementation, weights_config in models_attentions_weights: + benchmark_intel_openvino(model, attn_implementation, weights_config) diff --git a/llm_perf/hardware/intel/update_llm_perf_intel_ort.py b/llm_perf/hardware/intel/update_llm_perf_intel_ort.py new file mode 100644 index 000000000..e7bb254c8 --- /dev/null +++ b/llm_perf/hardware/intel/update_llm_perf_intel_ort.py @@ -0,0 +1,182 @@ +import os +import traceback +from itertools import product +from logging import getLogger + +from llm_perf.utils import ( + CANONICAL_PRETRAINED_OPEN_LLM_LIST, + GENERATE_KWARGS, + INPUT_SHAPES, + OPEN_LLM_LIST, + PRETRAINED_OPEN_LLM_LIST, + is_benchmark_conducted, + is_benchmark_supported, +) +from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport, InferenceConfig, ProcessConfig, ORTConfig +from optimum_benchmark.logging_utils import setup_logging + +SUBSET = os.getenv("SUBSET", None) +MACHINE = os.getenv("MACHINE", None) +HARDWARE = "intel" + + +if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: + PUSH_REPO_ID = "optimum-benchmark/llm-perf-ort-intel-debug" + CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] + SUBSET = "unquantized" +elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: + PUSH_REPO_ID = f"optimum-benchmark/llm-perf-ort-intel-{SUBSET}-{MACHINE}" +else: + raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") + +ATTENTION_CONFIGS = ["eager", "sdpa"] +if SUBSET == "unquantized": + WEIGHTS_CONFIGS = { + # unquantized + "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, + "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, + "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, + } +elif SUBSET == "bnb": + WEIGHTS_CONFIGS = { + # bnb + "4bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_4bit": True}}, + "8bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_8bit": True}}, + } +elif SUBSET == "gptq": + WEIGHTS_CONFIGS = { + # gptq + "4bit-gptq-exllama-v1": { + "quant_scheme": "gptq", + "torch_dtype": "float16", + "quant_config": {"bits": 4, "use_exllama ": True, "version": 1, "model_seqlen": 256}, + }, + "4bit-gptq-exllama-v2": { + "torch_dtype": "float16", + "quant_scheme": "gptq", + "quant_config": {"bits": 4, "use_exllama ": True, "version": 2, "model_seqlen": 256}, + }, + } +elif SUBSET == "awq": + WEIGHTS_CONFIGS = { + # awq + "4bit-awq-gemm": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": {"bits": 4, "version": "gemm"}, + }, + "4bit-awq-gemv": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": {"bits": 4, "version": "gemv"}, + }, + "4bit-awq-exllama-v1": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": { + "bits": 4, + "version": "exllama", + "exllama_config": {"version": 1, "max_input_len": 64, "max_batch_size": 1}, + }, + }, + "4bit-awq-exllama-v2": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": { + "bits": 4, + "version": "exllama", + "exllama_config": {"version": 2, "max_input_len": 64, "max_batch_size": 1}, + }, + }, + } + + +LOGGER = getLogger("llm-perf-backend") +LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") +LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") +LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): 
{len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") + + +def benchmark_intel_ort(model, attn_implementation, weights_config): + benchmark_name = f"{weights_config}-{attn_implementation}" + subfolder = f"{benchmark_name}/{model.replace('/', '--')}" + + torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] + quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] + quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] + + if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): + LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") + return + + if is_benchmark_conducted(PUSH_REPO_ID, subfolder): + LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") + return + + launcher_config = ProcessConfig() + scenario_config = InferenceConfig( + memory=True, + energy=True, + latency=True, + duration=10, + iterations=10, + warmup_runs=10, + input_shapes=INPUT_SHAPES, + generate_kwargs=GENERATE_KWARGS, + ) + backend_config = ORTConfig( + model=model, + device="cpu", + device_ids="0", + no_weights=True, + library="transformers", + task="text-generation", + torch_dtype=torch_dtype, + # quantization_scheme=quant_scheme, + quantization_config=quant_config, + # attn_implementation=attn_implementation, + model_kwargs={"trust_remote_code": True}, + ) + + benchmark_config = BenchmarkConfig( + name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config + ) + + benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + # try: + LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") + benchmark_report = Benchmark.launch(benchmark_config) + benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + # except Exception: + # LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") + # benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) + # benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + # benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + # benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + +if __name__ == "__main__": + # for isolated process + os.environ["LOG_TO_FILE"] = "0" + os.environ["LOG_LEVEL"] = "INFO" + + # for main process + setup_logging(level="INFO", prefix="MAIN-PROCESS") + + models_attentions_weights = list( + product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) + ) + + LOGGER.info( + f"Running a total of {len(models_attentions_weights)} benchmarks, " + f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " + f"{len(ATTENTION_CONFIGS)} attentions implementations " + f"and {len(WEIGHTS_CONFIGS)} weights configurations." 
+ ) + + for model, attn_implementation, weights_config in models_attentions_weights: + benchmark_intel_ort(model, attn_implementation, weights_config) diff --git a/llm_perf/hardware/intel/update_llm_perf_intel_pytorch.py b/llm_perf/hardware/intel/update_llm_perf_intel_pytorch.py new file mode 100644 index 000000000..bceb89d58 --- /dev/null +++ b/llm_perf/hardware/intel/update_llm_perf_intel_pytorch.py @@ -0,0 +1,182 @@ +import os +import traceback +from itertools import product +from logging import getLogger + +from llm_perf.utils import ( + CANONICAL_PRETRAINED_OPEN_LLM_LIST, + GENERATE_KWARGS, + INPUT_SHAPES, + OPEN_LLM_LIST, + PRETRAINED_OPEN_LLM_LIST, + is_benchmark_conducted, + is_benchmark_supported, +) +from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport, InferenceConfig, ProcessConfig, PyTorchConfig +from optimum_benchmark.logging_utils import setup_logging + +SUBSET = os.getenv("SUBSET", None) +MACHINE = os.getenv("MACHINE", None) +HARDWARE = "intel" + + +if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: + PUSH_REPO_ID = "optimum-benchmark/llm-perf-pytorch-intel-debug" + CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] + SUBSET = "unquantized" +elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: + PUSH_REPO_ID = f"optimum-benchmark/llm-perf-pytorch-intel-{SUBSET}-{MACHINE}" +else: + raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") + +ATTENTION_CONFIGS = ["eager", "sdpa"] +if SUBSET == "unquantized": + WEIGHTS_CONFIGS = { + # unquantized + "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, + "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, + "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, + } +elif SUBSET == "bnb": + WEIGHTS_CONFIGS = { + # bnb + "4bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_4bit": True}}, + "8bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_8bit": True}}, + } +elif SUBSET == "gptq": + WEIGHTS_CONFIGS = { + # gptq + "4bit-gptq-exllama-v1": { + "quant_scheme": "gptq", + "torch_dtype": "float16", + "quant_config": {"bits": 4, "use_exllama ": True, "version": 1, "model_seqlen": 256}, + }, + "4bit-gptq-exllama-v2": { + "torch_dtype": "float16", + "quant_scheme": "gptq", + "quant_config": {"bits": 4, "use_exllama ": True, "version": 2, "model_seqlen": 256}, + }, + } +elif SUBSET == "awq": + WEIGHTS_CONFIGS = { + # awq + "4bit-awq-gemm": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": {"bits": 4, "version": "gemm"}, + }, + "4bit-awq-gemv": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": {"bits": 4, "version": "gemv"}, + }, + "4bit-awq-exllama-v1": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": { + "bits": 4, + "version": "exllama", + "exllama_config": {"version": 1, "max_input_len": 64, "max_batch_size": 1}, + }, + }, + "4bit-awq-exllama-v2": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": { + "bits": 4, + "version": "exllama", + "exllama_config": {"version": 2, "max_input_len": 64, "max_batch_size": 1}, + }, + }, + } + + +LOGGER = getLogger("llm-perf-backend") +LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") +LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") +LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): 
{len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") + + +def benchmark_intel_pytorch(model, attn_implementation, weights_config): + benchmark_name = f"{weights_config}-{attn_implementation}" + subfolder = f"{benchmark_name}/{model.replace('/', '--')}" + + torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] + quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] + quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] + + if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): + LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") + return + + if is_benchmark_conducted(PUSH_REPO_ID, subfolder): + LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") + return + + launcher_config = ProcessConfig() + scenario_config = InferenceConfig( + memory=True, + energy=True, + latency=True, + duration=10, + iterations=10, + warmup_runs=10, + input_shapes=INPUT_SHAPES, + generate_kwargs=GENERATE_KWARGS, + ) + backend_config = PyTorchConfig( + model=model, + device="cpu", + device_ids="0", + no_weights=True, + library="transformers", + task="text-generation", + torch_dtype=torch_dtype, + quantization_scheme=quant_scheme, + quantization_config=quant_config, + attn_implementation=attn_implementation, + model_kwargs={"trust_remote_code": True}, + ) + + benchmark_config = BenchmarkConfig( + name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config + ) + + benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + try: + LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") + benchmark_report = Benchmark.launch(benchmark_config) + benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + except Exception: + LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") + benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) + benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + +if __name__ == "__main__": + # for isolated process + os.environ["LOG_TO_FILE"] = "0" + os.environ["LOG_LEVEL"] = "INFO" + + # for main process + setup_logging(level="INFO", prefix="MAIN-PROCESS") + + models_attentions_weights = list( + product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) + ) + + LOGGER.info( + f"Running a total of {len(models_attentions_weights)} benchmarks, " + f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " + f"{len(ATTENTION_CONFIGS)} attentions implementations " + f"and {len(WEIGHTS_CONFIGS)} weights configurations." 
+ ) + + for model, attn_implementation, weights_config in models_attentions_weights: + benchmark_intel_pytorch(model, attn_implementation, weights_config) diff --git a/llm_perf/utils.py b/llm_perf/utils.py index 6665536c7..ea25cdab3 100644 --- a/llm_perf/utils.py +++ b/llm_perf/utils.py @@ -137,8 +137,12 @@ def is_benchmark_conducted(push_repo_id, subfolder): return False -def is_benchmark_supported(weights_config, attn_implementation): - if attn_implementation == "flash_attention_2" and weights_config == "float32": - return False +def is_benchmark_supported(weights_config, attn_implementation, hardware): + if hardware == "cuda": + if attn_implementation == "flash_attention_2" and weights_config == "float32": + return False + elif hardware == "intel": + if attn_implementation == "flash_attention_2": + return False return True From 01680638d9acab85c96eb879e63b76b6eb658b63 Mon Sep 17 00:00:00 2001 From: baptiste Date: Wed, 21 Aug 2024 14:19:58 +0000 Subject: [PATCH 02/73] add intel pytorch ort and openvino to leaderboard --- .github/workflows/update_llm_perf_cuda_pytorch.yaml | 2 +- llm_perf/hardware/intel/update_llm_perf_intel_openvino.py | 4 +--- llm_perf/hardware/intel/update_llm_perf_intel_ort.py | 4 +--- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/.github/workflows/update_llm_perf_cuda_pytorch.yaml b/.github/workflows/update_llm_perf_cuda_pytorch.yaml index 0ab646ab9..567128e5c 100644 --- a/.github/workflows/update_llm_perf_cuda_pytorch.yaml +++ b/.github/workflows/update_llm_perf_cuda_pytorch.yaml @@ -53,4 +53,4 @@ jobs: pip install packaging && pip install flash-attn einops scipy auto-gptq optimum bitsandbytes autoawq codecarbon pip install -U transformers huggingface_hub[hf_transfer] pip install -e . - python llm_perf/update_llm_perf_cuda_pytorch.py + python llm_perf/hardware/cuda/update_llm_perf_cuda_pytorch.py diff --git a/llm_perf/hardware/intel/update_llm_perf_intel_openvino.py b/llm_perf/hardware/intel/update_llm_perf_intel_openvino.py index 1aa6052a9..869598285 100644 --- a/llm_perf/hardware/intel/update_llm_perf_intel_openvino.py +++ b/llm_perf/hardware/intel/update_llm_perf_intel_openvino.py @@ -12,7 +12,7 @@ is_benchmark_conducted, is_benchmark_supported, ) -from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport, InferenceConfig, ProcessConfig, OVConfig +from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport, InferenceConfig, OVConfig, ProcessConfig from optimum_benchmark.logging_utils import setup_logging SUBSET = os.getenv("SUBSET", None) @@ -101,8 +101,6 @@ def benchmark_intel_openvino(model, attn_implementation, weights_config): benchmark_name = f"{weights_config}-{attn_implementation}" subfolder = f"{benchmark_name}/{model.replace('/', '--')}" - torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] - quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): diff --git a/llm_perf/hardware/intel/update_llm_perf_intel_ort.py b/llm_perf/hardware/intel/update_llm_perf_intel_ort.py index e7bb254c8..5b8f5c3eb 100644 --- a/llm_perf/hardware/intel/update_llm_perf_intel_ort.py +++ b/llm_perf/hardware/intel/update_llm_perf_intel_ort.py @@ -1,5 +1,4 @@ import os -import traceback from itertools import product from logging import getLogger @@ -12,7 +11,7 @@ is_benchmark_conducted, is_benchmark_supported, ) -from optimum_benchmark import Benchmark, BenchmarkConfig, 
BenchmarkReport, InferenceConfig, ProcessConfig, ORTConfig +from optimum_benchmark import Benchmark, BenchmarkConfig, InferenceConfig, ORTConfig, ProcessConfig from optimum_benchmark.logging_utils import setup_logging SUBSET = os.getenv("SUBSET", None) @@ -102,7 +101,6 @@ def benchmark_intel_ort(model, attn_implementation, weights_config): subfolder = f"{benchmark_name}/{model.replace('/', '--')}" torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] - quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): From 0bc416f1bfaef797f2f6ad1104e520daf6cf96c1 Mon Sep 17 00:00:00 2001 From: Baptiste Colle <32412211+baptistecolle@users.noreply.github.com> Date: Wed, 21 Aug 2024 16:22:27 +0200 Subject: [PATCH 03/73] Add support for intel in leaderboard --- .../update_llm_perf_intel_pytorch.yml | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 .github/workflows/update_llm_perf_intel_pytorch.yml diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml b/.github/workflows/update_llm_perf_intel_pytorch.yml new file mode 100644 index 000000000..b7254edb9 --- /dev/null +++ b/.github/workflows/update_llm_perf_intel_pytorch.yml @@ -0,0 +1,55 @@ +name: Update LLM Perf Benchmarks - CUDA PyTorch + +on: + workflow_dispatch: + schedule: + - cron: "0 0 * * *" + +concurrency: + cancel-in-progress: true + group: ${{ github.workflow }}-${{ github.ref }} + +env: + IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cuda + +jobs: + run_benchmarks: + strategy: + fail-fast: false + matrix: + subset: [unquantized, bnb, awq, gptq] + + machine: [ + {name: c7i, runs-on: {group: 'aws-c7i-8xlarge-plus'}}, + ] + + runs-on: ${{ matrix.machine.runs-on }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Run benchmarks + uses: addnab/docker-run-action@v3 + env: + SUBSET: ${{ matrix.subset }} + MACHINE: ${{ matrix.machine.name }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} + with: + image: ${{ env.IMAGE }} + options: | + --rm + --gpus all + --shm-size 64G + --env SUBSET + --env MACHINE + --env HF_TOKEN + --env MKL_THREADING_LAYER=GNU + --env HF_HUB_ENABLE_HF_TRANSFER=1 + --volume ${{ github.workspace }}:/workspace + --workdir /workspace + run: | + pip install packaging && pip install flash-attn einops scipy auto-gptq optimum bitsandbytes autoawq codecarbon + pip install -U transformers huggingface_hub[hf_transfer] + pip install -e . 
+ python llm_perf/hardware/intel/update_llm_perf_cuda_pytorch.py From 85f62e653b362dcec0cb41140f2fa4b34f8acb2c Mon Sep 17 00:00:00 2001 From: Baptiste Colle <32412211+baptistecolle@users.noreply.github.com> Date: Wed, 21 Aug 2024 16:29:12 +0200 Subject: [PATCH 04/73] Update update_llm_perf_intel_pytorch.yml --- .github/workflows/update_llm_perf_intel_pytorch.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml b/.github/workflows/update_llm_perf_intel_pytorch.yml index b7254edb9..4d8413c49 100644 --- a/.github/workflows/update_llm_perf_intel_pytorch.yml +++ b/.github/workflows/update_llm_perf_intel_pytorch.yml @@ -1,6 +1,8 @@ name: Update LLM Perf Benchmarks - CUDA PyTorch on: + pull_request: + push: workflow_dispatch: schedule: - cron: "0 0 * * *" From 7151e01b798314164c5902046e38abdfd07ba3eb Mon Sep 17 00:00:00 2001 From: Baptiste Colle <32412211+baptistecolle@users.noreply.github.com> Date: Wed, 21 Aug 2024 16:40:12 +0200 Subject: [PATCH 05/73] Update update_llm_perf_intel_pytorch.yml --- .github/workflows/update_llm_perf_intel_pytorch.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml b/.github/workflows/update_llm_perf_intel_pytorch.yml index 4d8413c49..b7254edb9 100644 --- a/.github/workflows/update_llm_perf_intel_pytorch.yml +++ b/.github/workflows/update_llm_perf_intel_pytorch.yml @@ -1,8 +1,6 @@ name: Update LLM Perf Benchmarks - CUDA PyTorch on: - pull_request: - push: workflow_dispatch: schedule: - cron: "0 0 * * *" From c92f818f7c26a1bddae78adfc92e3827a85b2eb3 Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 22 Aug 2024 07:05:49 +0000 Subject: [PATCH 06/73] add new llm_perf_tests --- .../update_llm_perf_intel_pytorch.yaml | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 .github/workflows/update_llm_perf_intel_pytorch.yaml diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yaml b/.github/workflows/update_llm_perf_intel_pytorch.yaml new file mode 100644 index 000000000..2eb8b906c --- /dev/null +++ b/.github/workflows/update_llm_perf_intel_pytorch.yaml @@ -0,0 +1,55 @@ +name: Update LLM Perf Benchmarks - Intel PyTorch + +on: + workflow_dispatch: + schedule: + - cron: "0 0 * * *" + +concurrency: + cancel-in-progress: true + group: ${{ github.workflow }}-${{ github.ref }} + +env: + IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cuda + +jobs: + run_benchmarks: + strategy: + fail-fast: false + matrix: + subset: [unquantized, bnb, awq, gptq] + + machine: [ + {name: c7i, runs-on: {group: 'aws-c7i-8xlarge-plus'}}, + ] + + runs-on: ${{ matrix.machine.runs-on }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Run benchmarks + uses: addnab/docker-run-action@v3 + env: + SUBSET: ${{ matrix.subset }} + MACHINE: ${{ matrix.machine.name }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} + with: + image: ${{ env.IMAGE }} + options: | + --rm + --gpus all + --shm-size 64G + --env SUBSET + --env MACHINE + --env HF_TOKEN + --env MKL_THREADING_LAYER=GNU + --env HF_HUB_ENABLE_HF_TRANSFER=1 + --volume ${{ github.workspace }}:/workspace + --workdir /workspace + run: | + pip install packaging && pip install flash-attn einops scipy auto-gptq optimum bitsandbytes autoawq codecarbon + pip install -U transformers huggingface_hub[hf_transfer] + pip install -e . 
+ python llm_perf/hardware/intel/update_llm_perf_cuda_pytorch.py From c31e6cf5bcf9a8f9fc18ac10ee75543cbe0b8720 Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 22 Aug 2024 07:15:18 +0000 Subject: [PATCH 07/73] fix workflow --- .../update_llm_perf_intel_pytorch.yaml | 55 ------------------- .../update_llm_perf_intel_pytorch.yml | 2 +- 2 files changed, 1 insertion(+), 56 deletions(-) delete mode 100644 .github/workflows/update_llm_perf_intel_pytorch.yaml diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yaml b/.github/workflows/update_llm_perf_intel_pytorch.yaml deleted file mode 100644 index 2eb8b906c..000000000 --- a/.github/workflows/update_llm_perf_intel_pytorch.yaml +++ /dev/null @@ -1,55 +0,0 @@ -name: Update LLM Perf Benchmarks - Intel PyTorch - -on: - workflow_dispatch: - schedule: - - cron: "0 0 * * *" - -concurrency: - cancel-in-progress: true - group: ${{ github.workflow }}-${{ github.ref }} - -env: - IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cuda - -jobs: - run_benchmarks: - strategy: - fail-fast: false - matrix: - subset: [unquantized, bnb, awq, gptq] - - machine: [ - {name: c7i, runs-on: {group: 'aws-c7i-8xlarge-plus'}}, - ] - - runs-on: ${{ matrix.machine.runs-on }} - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Run benchmarks - uses: addnab/docker-run-action@v3 - env: - SUBSET: ${{ matrix.subset }} - MACHINE: ${{ matrix.machine.name }} - HF_TOKEN: ${{ secrets.HF_TOKEN }} - with: - image: ${{ env.IMAGE }} - options: | - --rm - --gpus all - --shm-size 64G - --env SUBSET - --env MACHINE - --env HF_TOKEN - --env MKL_THREADING_LAYER=GNU - --env HF_HUB_ENABLE_HF_TRANSFER=1 - --volume ${{ github.workspace }}:/workspace - --workdir /workspace - run: | - pip install packaging && pip install flash-attn einops scipy auto-gptq optimum bitsandbytes autoawq codecarbon - pip install -U transformers huggingface_hub[hf_transfer] - pip install -e . 
- python llm_perf/hardware/intel/update_llm_perf_cuda_pytorch.py diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml b/.github/workflows/update_llm_perf_intel_pytorch.yml index b7254edb9..2eb8b906c 100644 --- a/.github/workflows/update_llm_perf_intel_pytorch.yml +++ b/.github/workflows/update_llm_perf_intel_pytorch.yml @@ -1,4 +1,4 @@ -name: Update LLM Perf Benchmarks - CUDA PyTorch +name: Update LLM Perf Benchmarks - Intel PyTorch on: workflow_dispatch: From d4064401899fe76785197b60cd96f5fb5562bb18 Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 22 Aug 2024 10:51:21 +0000 Subject: [PATCH 08/73] fix failing tests --- .github/workflows/update_llm_perf_intel_pytorch.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml b/.github/workflows/update_llm_perf_intel_pytorch.yml index 2eb8b906c..9d3fd26ec 100644 --- a/.github/workflows/update_llm_perf_intel_pytorch.yml +++ b/.github/workflows/update_llm_perf_intel_pytorch.yml @@ -10,7 +10,7 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} env: - IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cuda + IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cpu jobs: run_benchmarks: From 20b96b2f6c7d5145e601590e5444cd2519a221b9 Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 22 Aug 2024 11:05:20 +0000 Subject: [PATCH 09/73] fix failing tests --- .github/workflows/update_llm_perf_intel_pytorch.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml b/.github/workflows/update_llm_perf_intel_pytorch.yml index 9d3fd26ec..4de8077db 100644 --- a/.github/workflows/update_llm_perf_intel_pytorch.yml +++ b/.github/workflows/update_llm_perf_intel_pytorch.yml @@ -39,7 +39,6 @@ jobs: image: ${{ env.IMAGE }} options: | --rm - --gpus all --shm-size 64G --env SUBSET --env MACHINE From c7e0ec0d7a413435661a8e501a5d2f5a8b39bb84 Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 22 Aug 2024 11:17:46 +0000 Subject: [PATCH 10/73] fix failing tests --- .github/workflows/update_llm_perf_intel_pytorch.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml b/.github/workflows/update_llm_perf_intel_pytorch.yml index 4de8077db..6ce8aafbf 100644 --- a/.github/workflows/update_llm_perf_intel_pytorch.yml +++ b/.github/workflows/update_llm_perf_intel_pytorch.yml @@ -51,4 +51,4 @@ jobs: pip install packaging && pip install flash-attn einops scipy auto-gptq optimum bitsandbytes autoawq codecarbon pip install -U transformers huggingface_hub[hf_transfer] pip install -e . 
- python llm_perf/hardware/intel/update_llm_perf_cuda_pytorch.py + python llm_perf/hardware/intel/update_llm_perf_intel_pytorch.py From 6d7bf692cf8057067d9184bab5a0b28b8792fa79 Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 22 Aug 2024 11:51:39 +0000 Subject: [PATCH 11/73] fix failing tests --- .github/workflows/update_llm_perf_intel_pytorch.yml | 4 ++-- llm_perf/hardware/intel/update_llm_perf_intel_pytorch.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml b/.github/workflows/update_llm_perf_intel_pytorch.yml index 6ce8aafbf..b285544aa 100644 --- a/.github/workflows/update_llm_perf_intel_pytorch.yml +++ b/.github/workflows/update_llm_perf_intel_pytorch.yml @@ -17,7 +17,7 @@ jobs: strategy: fail-fast: false matrix: - subset: [unquantized, bnb, awq, gptq] + subset: [unquantized] machine: [ {name: c7i, runs-on: {group: 'aws-c7i-8xlarge-plus'}}, @@ -48,7 +48,7 @@ jobs: --volume ${{ github.workspace }}:/workspace --workdir /workspace run: | - pip install packaging && pip install flash-attn einops scipy auto-gptq optimum bitsandbytes autoawq codecarbon + pip install packaging && pip install einops scipy optimum codecarbon pip install -U transformers huggingface_hub[hf_transfer] pip install -e . python llm_perf/hardware/intel/update_llm_perf_intel_pytorch.py diff --git a/llm_perf/hardware/intel/update_llm_perf_intel_pytorch.py b/llm_perf/hardware/intel/update_llm_perf_intel_pytorch.py index bceb89d58..797cfb2f6 100644 --- a/llm_perf/hardware/intel/update_llm_perf_intel_pytorch.py +++ b/llm_perf/hardware/intel/update_llm_perf_intel_pytorch.py @@ -127,7 +127,6 @@ def benchmark_intel_pytorch(model, attn_implementation, weights_config): backend_config = PyTorchConfig( model=model, device="cpu", - device_ids="0", no_weights=True, library="transformers", task="text-generation", From 7048df5f649a97078fd806077cb7dfb2054b2d86 Mon Sep 17 00:00:00 2001 From: baptiste Date: Mon, 2 Sep 2024 12:24:05 +0000 Subject: [PATCH 12/73] refractoring --- .../hardware/intel/update_llm_perf_intel.py | 211 ++++++++++++++++++ 1 file changed, 211 insertions(+) create mode 100644 llm_perf/hardware/intel/update_llm_perf_intel.py diff --git a/llm_perf/hardware/intel/update_llm_perf_intel.py b/llm_perf/hardware/intel/update_llm_perf_intel.py new file mode 100644 index 000000000..4ec3a09f5 --- /dev/null +++ b/llm_perf/hardware/intel/update_llm_perf_intel.py @@ -0,0 +1,211 @@ +import os +import traceback +from itertools import product +from logging import getLogger + +from llm_perf.utils import ( + CANONICAL_PRETRAINED_OPEN_LLM_LIST, + GENERATE_KWARGS, + INPUT_SHAPES, + OPEN_LLM_LIST, + PRETRAINED_OPEN_LLM_LIST, + is_benchmark_conducted, + is_benchmark_supported, +) +from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport, InferenceConfig, ProcessConfig, PyTorchConfig, ORTConfig, OVConfig +from optimum_benchmark.logging_utils import setup_logging + +SUBSET = os.getenv("SUBSET", None) +MACHINE = os.getenv("MACHINE", None) +HARDWARE = "intel" + + +if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: + PUSH_REPO_ID = "optimum-benchmark/llm-perf-pytorch-intel-debug" + CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] + SUBSET = "unquantized" +elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: + PUSH_REPO_ID = f"optimum-benchmark/llm-perf-pytorch-intel-{SUBSET}-{MACHINE}" +else: + raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") 
+ +ATTENTION_CONFIGS = ["eager", "sdpa"] +BACKENDS = ["pytorch", "onnxruntime", "openvino"] + + +if SUBSET == "unquantized": + WEIGHTS_CONFIGS = { + # unquantized + "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, + "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, + "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, + } +elif SUBSET == "bnb": + WEIGHTS_CONFIGS = { + # bnb + "4bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_4bit": True}}, + "8bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_8bit": True}}, + } +elif SUBSET == "gptq": + WEIGHTS_CONFIGS = { + # gptq + "4bit-gptq-exllama-v1": { + "quant_scheme": "gptq", + "torch_dtype": "float16", + "quant_config": {"bits": 4, "use_exllama ": True, "version": 1, "model_seqlen": 256}, + }, + "4bit-gptq-exllama-v2": { + "torch_dtype": "float16", + "quant_scheme": "gptq", + "quant_config": {"bits": 4, "use_exllama ": True, "version": 2, "model_seqlen": 256}, + }, + } +elif SUBSET == "awq": + WEIGHTS_CONFIGS = { + # awq + "4bit-awq-gemm": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": {"bits": 4, "version": "gemm"}, + }, + "4bit-awq-gemv": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": {"bits": 4, "version": "gemv"}, + }, + "4bit-awq-exllama-v1": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": { + "bits": 4, + "version": "exllama", + "exllama_config": {"version": 1, "max_input_len": 64, "max_batch_size": 1}, + }, + }, + "4bit-awq-exllama-v2": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": { + "bits": 4, + "version": "exllama", + "exllama_config": {"version": 2, "max_input_len": 64, "max_batch_size": 1}, + }, + }, + } + + +LOGGER = getLogger("llm-perf-backend") +LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") +LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") +LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") + + +def benchmark_intel(model, attn_implementation, weights_config, backend): + benchmark_name = f"{weights_config}-{attn_implementation}-{backend}" + subfolder = f"{benchmark_name}/{model.replace('/', '--')}" + + torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] + quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] + quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] + + if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): + LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") + return + + if is_benchmark_conducted(PUSH_REPO_ID, subfolder): + LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") + return + + launcher_config = ProcessConfig() + scenario_config = InferenceConfig( + memory=True, + energy=True, + latency=True, + duration=10, + iterations=10, + warmup_runs=10, + input_shapes=INPUT_SHAPES, + generate_kwargs=GENERATE_KWARGS, + ) + + if backend == "pytorch": + backend_config = PyTorchConfig( + model=model, + device="cpu", + no_weights=True, + library="transformers", + task="text-generation", + torch_dtype=torch_dtype, + quantization_scheme=quant_scheme, + quantization_config=quant_config, + attn_implementation=attn_implementation, + model_kwargs={"trust_remote_code": True}, + ) + elif backend == "onnxruntime": + backend_config = 
ORTConfig( + model=model, + device="cpu", + device_ids="0", + no_weights=True, + library="transformers", + task="text-generation", + torch_dtype=torch_dtype, + quantization_config=quant_config, + model_kwargs={"trust_remote_code": True}, + ) + elif backend == "openvino": + backend_config = OVConfig( + model=model, + device="cpu", + device_ids="0", + no_weights=True, + library="transformers", + task="text-generation", + quantization_config=quant_config, + model_kwargs={"trust_remote_code": True}, + ) + else: + raise ValueError(f"Unsupported backend: {backend}") + + benchmark_config = BenchmarkConfig( + name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config + ) + + benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + try: + LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") + benchmark_report = Benchmark.launch(benchmark_config) + benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + except Exception: + LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") + benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) + benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + +if __name__ == "__main__": + # for isolated process + os.environ["LOG_TO_FILE"] = "0" + os.environ["LOG_LEVEL"] = "INFO" + + # for main process + setup_logging(level="INFO", prefix="MAIN-PROCESS") + + models_attentions_weights = list( + product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys(), BACKENDS) + ) + + LOGGER.info( + f"Running a total of {len(models_attentions_weights)} benchmarks, " + f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " + f"{len(ATTENTION_CONFIGS)} attentions implementations " + f"and {len(WEIGHTS_CONFIGS)} weights configurations." + ) + + for model, attn_implementation, weights_config, backend in models_attentions_weights: + benchmark_intel(model, attn_implementation, weights_config, backend) From db88b2af8f4be21bdff36b6c1283590ccf6ec804 Mon Sep 17 00:00:00 2001 From: baptiste Date: Mon, 2 Sep 2024 12:30:16 +0000 Subject: [PATCH 13/73] intel with multiple backends --- .github/workflows/update_llm_perf_intel_pytorch.yml | 2 +- llm_perf/hardware/intel/update_llm_perf_intel.py | 11 ++++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml b/.github/workflows/update_llm_perf_intel_pytorch.yml index b285544aa..d43277f66 100644 --- a/.github/workflows/update_llm_perf_intel_pytorch.yml +++ b/.github/workflows/update_llm_perf_intel_pytorch.yml @@ -51,4 +51,4 @@ jobs: pip install packaging && pip install einops scipy optimum codecarbon pip install -U transformers huggingface_hub[hf_transfer] pip install -e . 
- python llm_perf/hardware/intel/update_llm_perf_intel_pytorch.py + python llm_perf/hardware/intel/update_llm_perf_intel.py diff --git a/llm_perf/hardware/intel/update_llm_perf_intel.py b/llm_perf/hardware/intel/update_llm_perf_intel.py index 4ec3a09f5..325ff9138 100644 --- a/llm_perf/hardware/intel/update_llm_perf_intel.py +++ b/llm_perf/hardware/intel/update_llm_perf_intel.py @@ -12,7 +12,16 @@ is_benchmark_conducted, is_benchmark_supported, ) -from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport, InferenceConfig, ProcessConfig, PyTorchConfig, ORTConfig, OVConfig +from optimum_benchmark import ( + Benchmark, + BenchmarkConfig, + BenchmarkReport, + InferenceConfig, + ORTConfig, + OVConfig, + ProcessConfig, + PyTorchConfig, +) from optimum_benchmark.logging_utils import setup_logging SUBSET = os.getenv("SUBSET", None) From 1246d28ef590c87cd0ce4e8c5c3a99f157eee6e6 Mon Sep 17 00:00:00 2001 From: baptiste Date: Mon, 2 Sep 2024 18:31:32 +0000 Subject: [PATCH 14/73] parallelize intel llm-perf --- .../update_llm_perf_intel_pytorch.yml | 2 +- .../hardware/intel/update_llm_perf_intel.py | 20 +++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml b/.github/workflows/update_llm_perf_intel_pytorch.yml index d43277f66..d66f23b7f 100644 --- a/.github/workflows/update_llm_perf_intel_pytorch.yml +++ b/.github/workflows/update_llm_perf_intel_pytorch.yml @@ -18,7 +18,7 @@ jobs: fail-fast: false matrix: subset: [unquantized] - + backend: [pytorch, onnxruntime, openvino] machine: [ {name: c7i, runs-on: {group: 'aws-c7i-8xlarge-plus'}}, ] diff --git a/llm_perf/hardware/intel/update_llm_perf_intel.py b/llm_perf/hardware/intel/update_llm_perf_intel.py index 325ff9138..58f0a3666 100644 --- a/llm_perf/hardware/intel/update_llm_perf_intel.py +++ b/llm_perf/hardware/intel/update_llm_perf_intel.py @@ -27,6 +27,7 @@ SUBSET = os.getenv("SUBSET", None) MACHINE = os.getenv("MACHINE", None) HARDWARE = "intel" +BACKEND = os.getenv("BACKEND", None) if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: @@ -39,7 +40,6 @@ raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") ATTENTION_CONFIGS = ["eager", "sdpa"] -BACKENDS = ["pytorch", "onnxruntime", "openvino"] if SUBSET == "unquantized": @@ -109,8 +109,8 @@ LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") -def benchmark_intel(model, attn_implementation, weights_config, backend): - benchmark_name = f"{weights_config}-{attn_implementation}-{backend}" +def benchmark_intel(model, attn_implementation, weights_config): + benchmark_name = f"{weights_config}-{attn_implementation}-{BACKEND}" subfolder = f"{benchmark_name}/{model.replace('/', '--')}" torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] @@ -137,7 +137,7 @@ def benchmark_intel(model, attn_implementation, weights_config, backend): generate_kwargs=GENERATE_KWARGS, ) - if backend == "pytorch": + if BACKEND == "pytorch": backend_config = PyTorchConfig( model=model, device="cpu", @@ -150,7 +150,7 @@ def benchmark_intel(model, attn_implementation, weights_config, backend): attn_implementation=attn_implementation, model_kwargs={"trust_remote_code": True}, ) - elif backend == "onnxruntime": + elif BACKEND == "onnxruntime": backend_config = ORTConfig( model=model, device="cpu", @@ -162,7 +162,7 @@ def benchmark_intel(model, attn_implementation, weights_config, backend): 
quantization_config=quant_config, model_kwargs={"trust_remote_code": True}, ) - elif backend == "openvino": + elif BACKEND == "openvino": backend_config = OVConfig( model=model, device="cpu", @@ -174,7 +174,7 @@ def benchmark_intel(model, attn_implementation, weights_config, backend): model_kwargs={"trust_remote_code": True}, ) else: - raise ValueError(f"Unsupported backend: {backend}") + raise ValueError(f"Unsupported backend: {BACKEND}") benchmark_config = BenchmarkConfig( name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config @@ -206,7 +206,7 @@ def benchmark_intel(model, attn_implementation, weights_config, backend): setup_logging(level="INFO", prefix="MAIN-PROCESS") models_attentions_weights = list( - product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys(), BACKENDS) + product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) ) LOGGER.info( @@ -216,5 +216,5 @@ def benchmark_intel(model, attn_implementation, weights_config, backend): f"and {len(WEIGHTS_CONFIGS)} weights configurations." ) - for model, attn_implementation, weights_config, backend in models_attentions_weights: - benchmark_intel(model, attn_implementation, weights_config, backend) + for model, attn_implementation, weights_config in models_attentions_weights: + benchmark_intel(model, attn_implementation, weights_config) From 2d6830e12eca7d7f7736ee8a685fff9435613933 Mon Sep 17 00:00:00 2001 From: baptiste Date: Mon, 2 Sep 2024 18:36:21 +0000 Subject: [PATCH 15/73] parallelize intel llm-perf --- .github/workflows/update_llm_perf_intel_pytorch.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml b/.github/workflows/update_llm_perf_intel_pytorch.yml index d66f23b7f..9719ed3ec 100644 --- a/.github/workflows/update_llm_perf_intel_pytorch.yml +++ b/.github/workflows/update_llm_perf_intel_pytorch.yml @@ -33,6 +33,7 @@ jobs: uses: addnab/docker-run-action@v3 env: SUBSET: ${{ matrix.subset }} + BACKEND: ${{ matrix.backend }} MACHINE: ${{ matrix.machine.name }} HF_TOKEN: ${{ secrets.HF_TOKEN }} with: From 801c5bfef8793bb0597673addc7ffaff711126d6 Mon Sep 17 00:00:00 2001 From: baptiste Date: Mon, 2 Sep 2024 18:44:07 +0000 Subject: [PATCH 16/73] parallelize intel llm-perf --- .github/workflows/update_llm_perf_intel_pytorch.yml | 4 ++-- llm_perf/hardware/intel/update_llm_perf_intel.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml b/.github/workflows/update_llm_perf_intel_pytorch.yml index 9719ed3ec..31c75e874 100644 --- a/.github/workflows/update_llm_perf_intel_pytorch.yml +++ b/.github/workflows/update_llm_perf_intel_pytorch.yml @@ -17,8 +17,8 @@ jobs: strategy: fail-fast: false matrix: - subset: [unquantized] backend: [pytorch, onnxruntime, openvino] + subset: [unquantized] machine: [ {name: c7i, runs-on: {group: 'aws-c7i-8xlarge-plus'}}, ] @@ -32,8 +32,8 @@ jobs: - name: Run benchmarks uses: addnab/docker-run-action@v3 env: - SUBSET: ${{ matrix.subset }} BACKEND: ${{ matrix.backend }} + SUBSET: ${{ matrix.subset }} MACHINE: ${{ matrix.machine.name }} HF_TOKEN: ${{ secrets.HF_TOKEN }} with: diff --git a/llm_perf/hardware/intel/update_llm_perf_intel.py b/llm_perf/hardware/intel/update_llm_perf_intel.py index 58f0a3666..f91eb5a23 100644 --- a/llm_perf/hardware/intel/update_llm_perf_intel.py +++ b/llm_perf/hardware/intel/update_llm_perf_intel.py @@ -26,9 +26,8 @@ SUBSET = os.getenv("SUBSET", None) MACHINE = 
os.getenv("MACHINE", None) -HARDWARE = "intel" BACKEND = os.getenv("BACKEND", None) - +HARDWARE = "intel" if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: PUSH_REPO_ID = "optimum-benchmark/llm-perf-pytorch-intel-debug" From 2e9526c7da3d933750fcf79d17715c629d0ffaa0 Mon Sep 17 00:00:00 2001 From: baptiste Date: Mon, 2 Sep 2024 18:59:51 +0000 Subject: [PATCH 17/73] parallelize intel llm-perf --- .github/workflows/update_llm_perf_intel_pytorch.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml b/.github/workflows/update_llm_perf_intel_pytorch.yml index 31c75e874..65a9bf73c 100644 --- a/.github/workflows/update_llm_perf_intel_pytorch.yml +++ b/.github/workflows/update_llm_perf_intel_pytorch.yml @@ -41,6 +41,7 @@ jobs: options: | --rm --shm-size 64G + --env BACKEND --env SUBSET --env MACHINE --env HF_TOKEN From 6d87d31a280cf5c86c69ced7e3faeca3973858ca Mon Sep 17 00:00:00 2001 From: baptiste Date: Mon, 2 Sep 2024 20:18:57 +0000 Subject: [PATCH 18/73] parallelize intel llm-perf --- .github/workflows/update_llm_perf_intel_pytorch.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml b/.github/workflows/update_llm_perf_intel_pytorch.yml index 65a9bf73c..5de6a8591 100644 --- a/.github/workflows/update_llm_perf_intel_pytorch.yml +++ b/.github/workflows/update_llm_perf_intel_pytorch.yml @@ -50,7 +50,7 @@ jobs: --volume ${{ github.workspace }}:/workspace --workdir /workspace run: | - pip install packaging && pip install einops scipy optimum codecarbon + pip install packaging && pip install einops scipy optimum codecarbon onnxruntime openvino pip install -U transformers huggingface_hub[hf_transfer] pip install -e . python llm_perf/hardware/intel/update_llm_perf_intel.py From 62266a6d7eb473b1e82051be31393a0975dbd581 Mon Sep 17 00:00:00 2001 From: baptiste Date: Mon, 2 Sep 2024 20:26:10 +0000 Subject: [PATCH 19/73] parallelize intel llm-perf --- .github/workflows/update_llm_perf_intel_pytorch.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml b/.github/workflows/update_llm_perf_intel_pytorch.yml index 5de6a8591..d48f26d61 100644 --- a/.github/workflows/update_llm_perf_intel_pytorch.yml +++ b/.github/workflows/update_llm_perf_intel_pytorch.yml @@ -50,7 +50,7 @@ jobs: --volume ${{ github.workspace }}:/workspace --workdir /workspace run: | - pip install packaging && pip install einops scipy optimum codecarbon onnxruntime openvino + pip install packaging && pip install einops scipy optimum codecarbon pip install -U transformers huggingface_hub[hf_transfer] - pip install -e . 
+ pip install -e .[onnxruntime, openvino] python llm_perf/hardware/intel/update_llm_perf_intel.py From 0a39667ba612e17e93e1a4c76763400183fd7341 Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 3 Sep 2024 05:42:06 +0000 Subject: [PATCH 20/73] parallelize intel llm-perf --- .github/workflows/update_llm_perf_intel_pytorch.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml b/.github/workflows/update_llm_perf_intel_pytorch.yml index d48f26d61..c972a8869 100644 --- a/.github/workflows/update_llm_perf_intel_pytorch.yml +++ b/.github/workflows/update_llm_perf_intel_pytorch.yml @@ -52,5 +52,5 @@ jobs: run: | pip install packaging && pip install einops scipy optimum codecarbon pip install -U transformers huggingface_hub[hf_transfer] - pip install -e .[onnxruntime, openvino] + pip install -e .[onnxruntime,openvino] python llm_perf/hardware/intel/update_llm_perf_intel.py From caf7b67393783ea7ff331f9125a9639604969cb2 Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 3 Sep 2024 06:00:03 +0000 Subject: [PATCH 21/73] parallelize intel llm-perf --- llm_perf/hardware/intel/update_llm_perf_intel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_perf/hardware/intel/update_llm_perf_intel.py b/llm_perf/hardware/intel/update_llm_perf_intel.py index f91eb5a23..8fe64e6d0 100644 --- a/llm_perf/hardware/intel/update_llm_perf_intel.py +++ b/llm_perf/hardware/intel/update_llm_perf_intel.py @@ -34,7 +34,7 @@ CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] SUBSET = "unquantized" elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-pytorch-intel-{SUBSET}-{MACHINE}" + PUSH_REPO_ID = f"optimum-benchmark/llm-perf-pytorch-intel-{SUBSET}-{MACHINE}-{BACKEND}" else: raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") From 5890457b58c7c7614046ed1d32f98ce9345178ab Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 3 Sep 2024 11:28:14 +0000 Subject: [PATCH 22/73] parallelize intel llm-perf --- llm_perf/hardware/intel/update_llm_perf_intel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llm_perf/hardware/intel/update_llm_perf_intel.py b/llm_perf/hardware/intel/update_llm_perf_intel.py index 8fe64e6d0..5c760e1bd 100644 --- a/llm_perf/hardware/intel/update_llm_perf_intel.py +++ b/llm_perf/hardware/intel/update_llm_perf_intel.py @@ -30,11 +30,11 @@ HARDWARE = "intel" if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: - PUSH_REPO_ID = "optimum-benchmark/llm-perf-pytorch-intel-debug" + PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-intel-debug" CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] SUBSET = "unquantized" elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-pytorch-intel-{SUBSET}-{MACHINE}-{BACKEND}" + PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-intel-{SUBSET}-{MACHINE}" else: raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") From 50bd1a2e8acb8f6da6c4870a2c01b5a763f13fe4 Mon Sep 17 00:00:00 2001 From: baptiste Date: Wed, 4 Sep 2024 10:06:06 +0000 Subject: [PATCH 23/73] update leaderboard collection to support more hardware --- llm_perf/hardware.yml | 43 +++++ .../intel/update_llm_perf_intel_openvino.py | 177 ----------------- .../intel/update_llm_perf_intel_ort.py | 180 ----------------- 
.../intel/update_llm_perf_intel_pytorch.py | 181 ------------------ .../update_llm_perf_cuda_pytorch.py | 0 .../{intel => }/update_llm_perf_intel.py | 0 llm_perf/hardware/utils.py | 20 ++ llm_perf/update_llm_perf_leaderboard.py | 32 +++- 8 files changed, 86 insertions(+), 547 deletions(-) create mode 100644 llm_perf/hardware.yml delete mode 100644 llm_perf/hardware/intel/update_llm_perf_intel_openvino.py delete mode 100644 llm_perf/hardware/intel/update_llm_perf_intel_ort.py delete mode 100644 llm_perf/hardware/intel/update_llm_perf_intel_pytorch.py rename llm_perf/hardware/{cuda => }/update_llm_perf_cuda_pytorch.py (100%) rename llm_perf/hardware/{intel => }/update_llm_perf_intel.py (100%) create mode 100644 llm_perf/hardware/utils.py diff --git a/llm_perf/hardware.yml b/llm_perf/hardware.yml new file mode 100644 index 000000000..a69f35e33 --- /dev/null +++ b/llm_perf/hardware.yml @@ -0,0 +1,43 @@ +- machine: 1xA10 + description: A10-24GB-150W 🖥️ + type: cuda + subsets: + - unquantized + - awq + - bnb + - gptq + backends: + - pytorch + +- machine: 1xA100 + description: A100-80GB-275W 🖥️ + type: cuda + subsets: + - unquantized + - awq + - bnb + - gptq + backends: + - pytorch + +- machine: 1xT4 + description: T4-16GB-70W 🖥️ + type: cuda + subsets: + - unquantized + - awq + - bnb + - gptq + backends: + - pytorch + +- machine: c7i + description: 4th-Gen-Intel-Xeon-385W 🖥️ + type: intel + subsets: + - unquantized + - awq + - bnb + - gptq + backends: + - pytorch \ No newline at end of file diff --git a/llm_perf/hardware/intel/update_llm_perf_intel_openvino.py b/llm_perf/hardware/intel/update_llm_perf_intel_openvino.py deleted file mode 100644 index 869598285..000000000 --- a/llm_perf/hardware/intel/update_llm_perf_intel_openvino.py +++ /dev/null @@ -1,177 +0,0 @@ -import os -import traceback -from itertools import product -from logging import getLogger - -from llm_perf.utils import ( - CANONICAL_PRETRAINED_OPEN_LLM_LIST, - GENERATE_KWARGS, - INPUT_SHAPES, - OPEN_LLM_LIST, - PRETRAINED_OPEN_LLM_LIST, - is_benchmark_conducted, - is_benchmark_supported, -) -from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport, InferenceConfig, OVConfig, ProcessConfig -from optimum_benchmark.logging_utils import setup_logging - -SUBSET = os.getenv("SUBSET", None) -MACHINE = os.getenv("MACHINE", None) -HARDWARE = "intel" - - -if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: - PUSH_REPO_ID = "optimum-benchmark/llm-perf-openvino-intel-debug" - CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] - SUBSET = "unquantized" -elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-openvino-intel-{SUBSET}-{MACHINE}" -else: - raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") - -ATTENTION_CONFIGS = ["eager", "sdpa"] -if SUBSET == "unquantized": - WEIGHTS_CONFIGS = { - # unquantized - "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, - "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, - "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, - } -elif SUBSET == "bnb": - WEIGHTS_CONFIGS = { - # bnb - "4bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_4bit": True}}, - "8bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_8bit": True}}, - } -elif SUBSET == "gptq": - WEIGHTS_CONFIGS = { - # gptq - 
"4bit-gptq-exllama-v1": { - "quant_scheme": "gptq", - "torch_dtype": "float16", - "quant_config": {"bits": 4, "use_exllama ": True, "version": 1, "model_seqlen": 256}, - }, - "4bit-gptq-exllama-v2": { - "torch_dtype": "float16", - "quant_scheme": "gptq", - "quant_config": {"bits": 4, "use_exllama ": True, "version": 2, "model_seqlen": 256}, - }, - } -elif SUBSET == "awq": - WEIGHTS_CONFIGS = { - # awq - "4bit-awq-gemm": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": {"bits": 4, "version": "gemm"}, - }, - "4bit-awq-gemv": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": {"bits": 4, "version": "gemv"}, - }, - "4bit-awq-exllama-v1": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": { - "bits": 4, - "version": "exllama", - "exllama_config": {"version": 1, "max_input_len": 64, "max_batch_size": 1}, - }, - }, - "4bit-awq-exllama-v2": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": { - "bits": 4, - "version": "exllama", - "exllama_config": {"version": 2, "max_input_len": 64, "max_batch_size": 1}, - }, - }, - } - - -LOGGER = getLogger("llm-perf-backend") -LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") -LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") -LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") - - -def benchmark_intel_openvino(model, attn_implementation, weights_config): - benchmark_name = f"{weights_config}-{attn_implementation}" - subfolder = f"{benchmark_name}/{model.replace('/', '--')}" - - quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] - - if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") - return - - if is_benchmark_conducted(PUSH_REPO_ID, subfolder): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") - return - - launcher_config = ProcessConfig() - scenario_config = InferenceConfig( - memory=True, - energy=True, - latency=True, - duration=10, - iterations=10, - warmup_runs=10, - input_shapes=INPUT_SHAPES, - generate_kwargs=GENERATE_KWARGS, - ) - backend_config = OVConfig( - model=model, - device="cpu", - device_ids="0", - no_weights=True, - library="transformers", - task="text-generation", - quantization_config=quant_config, - model_kwargs={"trust_remote_code": True}, - ) - - benchmark_config = BenchmarkConfig( - name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config - ) - - benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - try: - LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") - benchmark_report = Benchmark.launch(benchmark_config) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - except Exception: - LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") - benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - -if __name__ == 
"__main__": - # for isolated process - os.environ["LOG_TO_FILE"] = "0" - os.environ["LOG_LEVEL"] = "INFO" - - # for main process - setup_logging(level="INFO", prefix="MAIN-PROCESS") - - models_attentions_weights = list( - product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) - ) - - LOGGER.info( - f"Running a total of {len(models_attentions_weights)} benchmarks, " - f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " - f"{len(ATTENTION_CONFIGS)} attentions implementations " - f"and {len(WEIGHTS_CONFIGS)} weights configurations." - ) - - for model, attn_implementation, weights_config in models_attentions_weights: - benchmark_intel_openvino(model, attn_implementation, weights_config) diff --git a/llm_perf/hardware/intel/update_llm_perf_intel_ort.py b/llm_perf/hardware/intel/update_llm_perf_intel_ort.py deleted file mode 100644 index 5b8f5c3eb..000000000 --- a/llm_perf/hardware/intel/update_llm_perf_intel_ort.py +++ /dev/null @@ -1,180 +0,0 @@ -import os -from itertools import product -from logging import getLogger - -from llm_perf.utils import ( - CANONICAL_PRETRAINED_OPEN_LLM_LIST, - GENERATE_KWARGS, - INPUT_SHAPES, - OPEN_LLM_LIST, - PRETRAINED_OPEN_LLM_LIST, - is_benchmark_conducted, - is_benchmark_supported, -) -from optimum_benchmark import Benchmark, BenchmarkConfig, InferenceConfig, ORTConfig, ProcessConfig -from optimum_benchmark.logging_utils import setup_logging - -SUBSET = os.getenv("SUBSET", None) -MACHINE = os.getenv("MACHINE", None) -HARDWARE = "intel" - - -if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: - PUSH_REPO_ID = "optimum-benchmark/llm-perf-ort-intel-debug" - CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] - SUBSET = "unquantized" -elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-ort-intel-{SUBSET}-{MACHINE}" -else: - raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") - -ATTENTION_CONFIGS = ["eager", "sdpa"] -if SUBSET == "unquantized": - WEIGHTS_CONFIGS = { - # unquantized - "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, - "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, - "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, - } -elif SUBSET == "bnb": - WEIGHTS_CONFIGS = { - # bnb - "4bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_4bit": True}}, - "8bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_8bit": True}}, - } -elif SUBSET == "gptq": - WEIGHTS_CONFIGS = { - # gptq - "4bit-gptq-exllama-v1": { - "quant_scheme": "gptq", - "torch_dtype": "float16", - "quant_config": {"bits": 4, "use_exllama ": True, "version": 1, "model_seqlen": 256}, - }, - "4bit-gptq-exllama-v2": { - "torch_dtype": "float16", - "quant_scheme": "gptq", - "quant_config": {"bits": 4, "use_exllama ": True, "version": 2, "model_seqlen": 256}, - }, - } -elif SUBSET == "awq": - WEIGHTS_CONFIGS = { - # awq - "4bit-awq-gemm": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": {"bits": 4, "version": "gemm"}, - }, - "4bit-awq-gemv": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": {"bits": 4, "version": "gemv"}, - }, - "4bit-awq-exllama-v1": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": { - "bits": 4, - "version": "exllama", - "exllama_config": {"version": 1, 
"max_input_len": 64, "max_batch_size": 1}, - }, - }, - "4bit-awq-exllama-v2": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": { - "bits": 4, - "version": "exllama", - "exllama_config": {"version": 2, "max_input_len": 64, "max_batch_size": 1}, - }, - }, - } - - -LOGGER = getLogger("llm-perf-backend") -LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") -LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") -LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") - - -def benchmark_intel_ort(model, attn_implementation, weights_config): - benchmark_name = f"{weights_config}-{attn_implementation}" - subfolder = f"{benchmark_name}/{model.replace('/', '--')}" - - torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] - quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] - - if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") - return - - if is_benchmark_conducted(PUSH_REPO_ID, subfolder): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") - return - - launcher_config = ProcessConfig() - scenario_config = InferenceConfig( - memory=True, - energy=True, - latency=True, - duration=10, - iterations=10, - warmup_runs=10, - input_shapes=INPUT_SHAPES, - generate_kwargs=GENERATE_KWARGS, - ) - backend_config = ORTConfig( - model=model, - device="cpu", - device_ids="0", - no_weights=True, - library="transformers", - task="text-generation", - torch_dtype=torch_dtype, - # quantization_scheme=quant_scheme, - quantization_config=quant_config, - # attn_implementation=attn_implementation, - model_kwargs={"trust_remote_code": True}, - ) - - benchmark_config = BenchmarkConfig( - name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config - ) - - benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - # try: - LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") - benchmark_report = Benchmark.launch(benchmark_config) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - # except Exception: - # LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") - # benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) - # benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - # benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - # benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - -if __name__ == "__main__": - # for isolated process - os.environ["LOG_TO_FILE"] = "0" - os.environ["LOG_LEVEL"] = "INFO" - - # for main process - setup_logging(level="INFO", prefix="MAIN-PROCESS") - - models_attentions_weights = list( - product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) - ) - - LOGGER.info( - f"Running a total of {len(models_attentions_weights)} benchmarks, " - f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " - f"{len(ATTENTION_CONFIGS)} attentions implementations " - f"and {len(WEIGHTS_CONFIGS)} weights configurations." 
- ) - - for model, attn_implementation, weights_config in models_attentions_weights: - benchmark_intel_ort(model, attn_implementation, weights_config) diff --git a/llm_perf/hardware/intel/update_llm_perf_intel_pytorch.py b/llm_perf/hardware/intel/update_llm_perf_intel_pytorch.py deleted file mode 100644 index 797cfb2f6..000000000 --- a/llm_perf/hardware/intel/update_llm_perf_intel_pytorch.py +++ /dev/null @@ -1,181 +0,0 @@ -import os -import traceback -from itertools import product -from logging import getLogger - -from llm_perf.utils import ( - CANONICAL_PRETRAINED_OPEN_LLM_LIST, - GENERATE_KWARGS, - INPUT_SHAPES, - OPEN_LLM_LIST, - PRETRAINED_OPEN_LLM_LIST, - is_benchmark_conducted, - is_benchmark_supported, -) -from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport, InferenceConfig, ProcessConfig, PyTorchConfig -from optimum_benchmark.logging_utils import setup_logging - -SUBSET = os.getenv("SUBSET", None) -MACHINE = os.getenv("MACHINE", None) -HARDWARE = "intel" - - -if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: - PUSH_REPO_ID = "optimum-benchmark/llm-perf-pytorch-intel-debug" - CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] - SUBSET = "unquantized" -elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-pytorch-intel-{SUBSET}-{MACHINE}" -else: - raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") - -ATTENTION_CONFIGS = ["eager", "sdpa"] -if SUBSET == "unquantized": - WEIGHTS_CONFIGS = { - # unquantized - "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, - "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, - "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, - } -elif SUBSET == "bnb": - WEIGHTS_CONFIGS = { - # bnb - "4bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_4bit": True}}, - "8bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_8bit": True}}, - } -elif SUBSET == "gptq": - WEIGHTS_CONFIGS = { - # gptq - "4bit-gptq-exllama-v1": { - "quant_scheme": "gptq", - "torch_dtype": "float16", - "quant_config": {"bits": 4, "use_exllama ": True, "version": 1, "model_seqlen": 256}, - }, - "4bit-gptq-exllama-v2": { - "torch_dtype": "float16", - "quant_scheme": "gptq", - "quant_config": {"bits": 4, "use_exllama ": True, "version": 2, "model_seqlen": 256}, - }, - } -elif SUBSET == "awq": - WEIGHTS_CONFIGS = { - # awq - "4bit-awq-gemm": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": {"bits": 4, "version": "gemm"}, - }, - "4bit-awq-gemv": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": {"bits": 4, "version": "gemv"}, - }, - "4bit-awq-exllama-v1": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": { - "bits": 4, - "version": "exllama", - "exllama_config": {"version": 1, "max_input_len": 64, "max_batch_size": 1}, - }, - }, - "4bit-awq-exllama-v2": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": { - "bits": 4, - "version": "exllama", - "exllama_config": {"version": 2, "max_input_len": 64, "max_batch_size": 1}, - }, - }, - } - - -LOGGER = getLogger("llm-perf-backend") -LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") -LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") -LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): 
{len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") - - -def benchmark_intel_pytorch(model, attn_implementation, weights_config): - benchmark_name = f"{weights_config}-{attn_implementation}" - subfolder = f"{benchmark_name}/{model.replace('/', '--')}" - - torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] - quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] - quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] - - if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") - return - - if is_benchmark_conducted(PUSH_REPO_ID, subfolder): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") - return - - launcher_config = ProcessConfig() - scenario_config = InferenceConfig( - memory=True, - energy=True, - latency=True, - duration=10, - iterations=10, - warmup_runs=10, - input_shapes=INPUT_SHAPES, - generate_kwargs=GENERATE_KWARGS, - ) - backend_config = PyTorchConfig( - model=model, - device="cpu", - no_weights=True, - library="transformers", - task="text-generation", - torch_dtype=torch_dtype, - quantization_scheme=quant_scheme, - quantization_config=quant_config, - attn_implementation=attn_implementation, - model_kwargs={"trust_remote_code": True}, - ) - - benchmark_config = BenchmarkConfig( - name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config - ) - - benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - try: - LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") - benchmark_report = Benchmark.launch(benchmark_config) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - except Exception: - LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") - benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - -if __name__ == "__main__": - # for isolated process - os.environ["LOG_TO_FILE"] = "0" - os.environ["LOG_LEVEL"] = "INFO" - - # for main process - setup_logging(level="INFO", prefix="MAIN-PROCESS") - - models_attentions_weights = list( - product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) - ) - - LOGGER.info( - f"Running a total of {len(models_attentions_weights)} benchmarks, " - f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " - f"{len(ATTENTION_CONFIGS)} attentions implementations " - f"and {len(WEIGHTS_CONFIGS)} weights configurations." 
- ) - - for model, attn_implementation, weights_config in models_attentions_weights: - benchmark_intel_pytorch(model, attn_implementation, weights_config) diff --git a/llm_perf/hardware/cuda/update_llm_perf_cuda_pytorch.py b/llm_perf/hardware/update_llm_perf_cuda_pytorch.py similarity index 100% rename from llm_perf/hardware/cuda/update_llm_perf_cuda_pytorch.py rename to llm_perf/hardware/update_llm_perf_cuda_pytorch.py diff --git a/llm_perf/hardware/intel/update_llm_perf_intel.py b/llm_perf/hardware/update_llm_perf_intel.py similarity index 100% rename from llm_perf/hardware/intel/update_llm_perf_intel.py rename to llm_perf/hardware/update_llm_perf_intel.py diff --git a/llm_perf/hardware/utils.py b/llm_perf/hardware/utils.py new file mode 100644 index 000000000..a8cfbfc6d --- /dev/null +++ b/llm_perf/hardware/utils.py @@ -0,0 +1,20 @@ +from typing import Any, Dict, List +import yaml + +class HardwareConfig: + def __init__(self, data: Dict[str, Any]): + self.machine = data['machine'] + self.description = data['description'] + self.type = data['type'] + self.subsets = data['subsets'] + self.backends = data['backends'] + + + def __repr__(self): + return f"HardwareConfig(machine='{self.machine}', description='{self.description}', " \ + f"type={self.type}, subsets={self.subsets}, backends={self.backends})" + +def load_hardware_configs(file_path: str) -> List[HardwareConfig]: + with open(file_path, 'r') as file: + data = yaml.safe_load(file) + return [HardwareConfig(config) for config in data] \ No newline at end of file diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index 9c8763e63..d3e3e3650 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -5,18 +5,22 @@ from huggingface_hub import create_repo, snapshot_download, upload_file from tqdm import tqdm +from .hardware.utils import load_hardware_configs from optimum_benchmark import Benchmark REPO_TYPE = "dataset" MAIN_REPO_ID = "optimum-benchmark/llm-perf-leaderboard" -PERF_REPO_ID = "optimum-benchmark/llm-perf-pytorch-cuda-{subset}-{machine}" +PERF_REPO_ID = "optimum-benchmark/llm-perf-{backend}-{hardware}-{subset}-{machine}" PERF_DF = "perf-df-{subset}-{machine}.csv" LLM_DF = "llm-df.csv" -def gather_benchmarks(subset: str, machine: str): - perf_repo_id = PERF_REPO_ID.format(subset=subset, machine=machine) +def gather_benchmarks(subset: str, machine: str, backend: str, hardware_type: str): + """ + Gather the benchmarks for a given subset and machine + """ + perf_repo_id = PERF_REPO_ID.format(subset=subset, machine=machine, backend=backend, hardware_type=hardware_type) snapshot = snapshot_download(repo_type=REPO_TYPE, repo_id=perf_repo_id, allow_patterns=["**/benchmark.json"]) dfs = [] @@ -31,12 +35,19 @@ def gather_benchmarks(subset: str, machine: str): def update_perf_dfs(): - for subset in ["unquantized", "bnb", "awq", "gptq"]: - for machine in ["1xA10", "1xA100", "1xT4"]: - try: - gather_benchmarks(subset, machine) - except Exception: - print(f"Subset {subset} for machine {machine} not found") + """ + Update the performance dataframes for all subsets and machines + """ + hardware_configs = load_hardware_configs("hardware.yml") + + + for hardware_config in hardware_configs: + for subset in hardware_config.subsets: + for backend in hardware_config.backends: + try: + gather_benchmarks(subset, hardware_config.machine, backend, hardware_config.type) + except Exception: + print(f"Subset {subset} for machine {hardware_config.machine} not found") 
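
Taken together, hardware.yml, load_hardware_configs and the PERF_REPO_ID template let update_perf_dfs resolve one dataset repo per (backend, hardware, subset, machine) combination instead of the previously hard-coded pytorch/cuda lists. Below is a minimal sketch of that expansion, not part of the patch itself: it assumes the {hardware_type} placeholder spelling that a later patch in this series aligns the template with (at this point the template still reads {hardware}), and it inlines a copy of the c7i entry so the snippet stays self-contained.

import yaml

# Repo id scheme the series converges on (placeholder naming is aligned in a later patch).
PERF_REPO_ID_TEMPLATE = "optimum-benchmark/llm-perf-{backend}-{hardware_type}-{subset}-{machine}"

# Inline copy of the c7i entry from llm_perf/hardware.yml, kept here so the
# sketch runs without reading the file from disk.
EXAMPLE_HARDWARE_YML = """
- machine: c7i
  description: 4th-Gen-Intel-Xeon-385W
  hardware_type: intel
  subsets: [unquantized]
  backends: [pytorch, onnxruntime, openvino]
"""

for entry in yaml.safe_load(EXAMPLE_HARDWARE_YML):
    for subset in entry["subsets"]:
        for backend in entry["backends"]:
            # one dataset repo per (backend, hardware_type, subset, machine) combination
            print(
                PERF_REPO_ID_TEMPLATE.format(
                    backend=backend,
                    hardware_type=entry["hardware_type"],
                    subset=subset,
                    machine=entry["machine"],
                )
            )
# prints:
#   optimum-benchmark/llm-perf-pytorch-intel-unquantized-c7i
#   optimum-benchmark/llm-perf-onnxruntime-intel-unquantized-c7i
#   optimum-benchmark/llm-perf-openvino-intel-unquantized-c7i

Keeping the machine/backend matrix in hardware.yml rather than hard-coded inside update_perf_dfs is what allows new hardware to be added to the leaderboard collection by editing a single file.
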
scrapping_script = """ @@ -48,6 +59,9 @@ def update_perf_dfs(): def update_llm_df(): + """ + Scrape the open-llm-leaderboard and update the leaderboard dataframe + """ subprocess.run(scrapping_script, shell=True) create_repo(repo_id=MAIN_REPO_ID, repo_type=REPO_TYPE, exist_ok=True, private=False) upload_file( From f93cc7c0986758e2ec435d8fb2499473392a543d Mon Sep 17 00:00:00 2001 From: baptiste Date: Wed, 4 Sep 2024 10:09:40 +0000 Subject: [PATCH 24/73] update leaderboard collection to support more hardware --- llm_perf/hardware/utils.py | 24 ++++++++++++++---------- llm_perf/update_llm_perf_leaderboard.py | 4 ++-- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/llm_perf/hardware/utils.py b/llm_perf/hardware/utils.py index a8cfbfc6d..bf24f7f3c 100644 --- a/llm_perf/hardware/utils.py +++ b/llm_perf/hardware/utils.py @@ -1,20 +1,24 @@ from typing import Any, Dict, List + import yaml + class HardwareConfig: def __init__(self, data: Dict[str, Any]): - self.machine = data['machine'] - self.description = data['description'] - self.type = data['type'] - self.subsets = data['subsets'] - self.backends = data['backends'] - + self.machine = data["machine"] + self.description = data["description"] + self.type = data["type"] + self.subsets = data["subsets"] + self.backends = data["backends"] def __repr__(self): - return f"HardwareConfig(machine='{self.machine}', description='{self.description}', " \ - f"type={self.type}, subsets={self.subsets}, backends={self.backends})" + return ( + f"HardwareConfig(machine='{self.machine}', description='{self.description}', " + f"type={self.type}, subsets={self.subsets}, backends={self.backends})" + ) + def load_hardware_configs(file_path: str) -> List[HardwareConfig]: - with open(file_path, 'r') as file: + with open(file_path, "r") as file: data = yaml.safe_load(file) - return [HardwareConfig(config) for config in data] \ No newline at end of file + return [HardwareConfig(config) for config in data] diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index d3e3e3650..27b2a0812 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -5,9 +5,10 @@ from huggingface_hub import create_repo, snapshot_download, upload_file from tqdm import tqdm -from .hardware.utils import load_hardware_configs from optimum_benchmark import Benchmark +from .hardware.utils import load_hardware_configs + REPO_TYPE = "dataset" MAIN_REPO_ID = "optimum-benchmark/llm-perf-leaderboard" PERF_REPO_ID = "optimum-benchmark/llm-perf-{backend}-{hardware}-{subset}-{machine}" @@ -40,7 +41,6 @@ def update_perf_dfs(): """ hardware_configs = load_hardware_configs("hardware.yml") - for hardware_config in hardware_configs: for subset in hardware_config.subsets: for backend in hardware_config.backends: From 2fad5931f38f719151d2067955488a7a842822ea Mon Sep 17 00:00:00 2001 From: baptiste Date: Wed, 4 Sep 2024 10:11:23 +0000 Subject: [PATCH 25/73] update leaderboard collection to support more hardware --- llm_perf/update_llm_perf_leaderboard.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index 27b2a0812..544fc4b2a 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -2,13 +2,12 @@ from glob import glob import pandas as pd +from hardware.utils import load_hardware_configs from huggingface_hub import create_repo, snapshot_download, upload_file from tqdm import 
tqdm from optimum_benchmark import Benchmark -from .hardware.utils import load_hardware_configs - REPO_TYPE = "dataset" MAIN_REPO_ID = "optimum-benchmark/llm-perf-leaderboard" PERF_REPO_ID = "optimum-benchmark/llm-perf-{backend}-{hardware}-{subset}-{machine}" From 6f2885c1835be9d25b9378805579d3f97b53755e Mon Sep 17 00:00:00 2001 From: baptiste Date: Wed, 4 Sep 2024 10:16:22 +0000 Subject: [PATCH 26/73] update leaderboard collection to support more hardware --- llm_perf/update_llm_perf_leaderboard.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index 544fc4b2a..1005cb3ad 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -38,7 +38,7 @@ def update_perf_dfs(): """ Update the performance dataframes for all subsets and machines """ - hardware_configs = load_hardware_configs("hardware.yml") + hardware_configs = load_hardware_configs("llm_perf/hardware.yml") for hardware_config in hardware_configs: for subset in hardware_config.subsets: From a59e55403215e3a4c734426276f5a4097a981886 Mon Sep 17 00:00:00 2001 From: baptiste Date: Wed, 4 Sep 2024 11:12:56 +0000 Subject: [PATCH 27/73] update leaderboard collection to support more hardware --- llm_perf/hardware.yml | 8 ++++---- llm_perf/hardware/utils.py | 4 ++-- llm_perf/update_llm_perf_leaderboard.py | 9 +++------ 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/llm_perf/hardware.yml b/llm_perf/hardware.yml index a69f35e33..0d9581793 100644 --- a/llm_perf/hardware.yml +++ b/llm_perf/hardware.yml @@ -1,6 +1,6 @@ - machine: 1xA10 description: A10-24GB-150W 🖥️ - type: cuda + hardware_type: cuda subsets: - unquantized - awq @@ -11,7 +11,7 @@ - machine: 1xA100 description: A100-80GB-275W 🖥️ - type: cuda + hardware_type: cuda subsets: - unquantized - awq @@ -22,7 +22,7 @@ - machine: 1xT4 description: T4-16GB-70W 🖥️ - type: cuda + hardware_type: cuda subsets: - unquantized - awq @@ -33,7 +33,7 @@ - machine: c7i description: 4th-Gen-Intel-Xeon-385W 🖥️ - type: intel + hardware_type: intel subsets: - unquantized - awq diff --git a/llm_perf/hardware/utils.py b/llm_perf/hardware/utils.py index bf24f7f3c..71c9ce706 100644 --- a/llm_perf/hardware/utils.py +++ b/llm_perf/hardware/utils.py @@ -7,14 +7,14 @@ class HardwareConfig: def __init__(self, data: Dict[str, Any]): self.machine = data["machine"] self.description = data["description"] - self.type = data["type"] + self.hardware_type = data["hardware_type"] self.subsets = data["subsets"] self.backends = data["backends"] def __repr__(self): return ( f"HardwareConfig(machine='{self.machine}', description='{self.description}', " - f"type={self.type}, subsets={self.subsets}, backends={self.backends})" + f"hardware_type={self.hardware_type}, subsets={self.subsets}, backends={self.backends})" ) diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index 1005cb3ad..283fc6330 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -10,7 +10,7 @@ REPO_TYPE = "dataset" MAIN_REPO_ID = "optimum-benchmark/llm-perf-leaderboard" -PERF_REPO_ID = "optimum-benchmark/llm-perf-{backend}-{hardware}-{subset}-{machine}" +PERF_REPO_ID = "optimum-benchmark/llm-perf-{backend}-{hardware_type}-{subset}-{machine}" PERF_DF = "perf-df-{subset}-{machine}.csv" LLM_DF = "llm-df.csv" @@ -43,10 +43,7 @@ def update_perf_dfs(): for hardware_config in hardware_configs: for subset in 
hardware_config.subsets: for backend in hardware_config.backends: - try: - gather_benchmarks(subset, hardware_config.machine, backend, hardware_config.type) - except Exception: - print(f"Subset {subset} for machine {hardware_config.machine} not found") + gather_benchmarks(subset, hardware_config.machine, backend, hardware_config.hardware_type) scrapping_script = """ @@ -69,5 +66,5 @@ def update_llm_df(): if __name__ == "__main__": - update_llm_df() + # update_llm_df() update_perf_dfs() From a1937482b48f745078b798e32cc8f54b5e8b04cc Mon Sep 17 00:00:00 2001 From: baptiste Date: Wed, 4 Sep 2024 11:14:29 +0000 Subject: [PATCH 28/73] update leaderboard collection to support more hardware --- llm_perf/update_llm_perf_leaderboard.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index 283fc6330..f0acd4751 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -18,7 +18,7 @@ def gather_benchmarks(subset: str, machine: str, backend: str, hardware_type: str): """ - Gather the benchmarks for a given subset and machine + Gather the benchmarks for a given machine """ perf_repo_id = PERF_REPO_ID.format(subset=subset, machine=machine, backend=backend, hardware_type=hardware_type) snapshot = snapshot_download(repo_type=REPO_TYPE, repo_id=perf_repo_id, allow_patterns=["**/benchmark.json"]) @@ -36,7 +36,7 @@ def gather_benchmarks(subset: str, machine: str, backend: str, hardware_type: st def update_perf_dfs(): """ - Update the performance dataframes for all subsets and machines + Update the performance dataframes for all machines """ hardware_configs = load_hardware_configs("llm_perf/hardware.yml") @@ -66,5 +66,5 @@ def update_llm_df(): if __name__ == "__main__": - # update_llm_df() + update_llm_df() update_perf_dfs() From 31f1ff66be19fd8d3ae41e5110af01bd372bfef5 Mon Sep 17 00:00:00 2001 From: baptiste Date: Wed, 4 Sep 2024 11:26:10 +0000 Subject: [PATCH 29/73] update leaderboard collection to support more hardware --- llm_perf/update_llm_perf_leaderboard.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index f0acd4751..0f4964156 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -43,7 +43,10 @@ def update_perf_dfs(): for hardware_config in hardware_configs: for subset in hardware_config.subsets: for backend in hardware_config.backends: - gather_benchmarks(subset, hardware_config.machine, backend, hardware_config.hardware_type) + try: + gather_benchmarks(subset, hardware_config.machine, backend, hardware_config.hardware_type) + except Exception as e: + print(f"Error gathering benchmarks for {hardware_config.machine} with {hardware_config.hardware_type} and {subset} with {backend}: {e}") scrapping_script = """ From 0f041fbba3a1824cb6389dd9c643bd98dcccd2e4 Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 5 Sep 2024 08:33:18 +0000 Subject: [PATCH 30/73] add new workflow --- ...ml => update_llm_perf_intel_openvino.yaml} | 0 .../update_llm_perf_intel_pytorch.yaml | 56 +++++++++++++++++++ llm_perf/hardware.yml | 7 +-- 3 files changed, 59 insertions(+), 4 deletions(-) rename .github/workflows/{update_llm_perf_intel_pytorch.yml => update_llm_perf_intel_openvino.yaml} (100%) create mode 100644 .github/workflows/update_llm_perf_intel_pytorch.yaml diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml 
b/.github/workflows/update_llm_perf_intel_openvino.yaml similarity index 100% rename from .github/workflows/update_llm_perf_intel_pytorch.yml rename to .github/workflows/update_llm_perf_intel_openvino.yaml diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yaml b/.github/workflows/update_llm_perf_intel_pytorch.yaml new file mode 100644 index 000000000..c972a8869 --- /dev/null +++ b/.github/workflows/update_llm_perf_intel_pytorch.yaml @@ -0,0 +1,56 @@ +name: Update LLM Perf Benchmarks - Intel PyTorch + +on: + workflow_dispatch: + schedule: + - cron: "0 0 * * *" + +concurrency: + cancel-in-progress: true + group: ${{ github.workflow }}-${{ github.ref }} + +env: + IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cpu + +jobs: + run_benchmarks: + strategy: + fail-fast: false + matrix: + backend: [pytorch, onnxruntime, openvino] + subset: [unquantized] + machine: [ + {name: c7i, runs-on: {group: 'aws-c7i-8xlarge-plus'}}, + ] + + runs-on: ${{ matrix.machine.runs-on }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Run benchmarks + uses: addnab/docker-run-action@v3 + env: + BACKEND: ${{ matrix.backend }} + SUBSET: ${{ matrix.subset }} + MACHINE: ${{ matrix.machine.name }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} + with: + image: ${{ env.IMAGE }} + options: | + --rm + --shm-size 64G + --env BACKEND + --env SUBSET + --env MACHINE + --env HF_TOKEN + --env MKL_THREADING_LAYER=GNU + --env HF_HUB_ENABLE_HF_TRANSFER=1 + --volume ${{ github.workspace }}:/workspace + --workdir /workspace + run: | + pip install packaging && pip install einops scipy optimum codecarbon + pip install -U transformers huggingface_hub[hf_transfer] + pip install -e .[onnxruntime,openvino] + python llm_perf/hardware/intel/update_llm_perf_intel.py diff --git a/llm_perf/hardware.yml b/llm_perf/hardware.yml index 0d9581793..d4a6bfb56 100644 --- a/llm_perf/hardware.yml +++ b/llm_perf/hardware.yml @@ -36,8 +36,7 @@ hardware_type: intel subsets: - unquantized - - awq - - bnb - - gptq backends: - - pytorch \ No newline at end of file + - pytorch + - onnxruntime + - openvino \ No newline at end of file From b2330b065170c556c14fd49f79cf59313e21c4e5 Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 5 Sep 2024 08:35:26 +0000 Subject: [PATCH 31/73] add new workflow --- .github/workflows/update_llm_perf_intel_openvino.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/update_llm_perf_intel_openvino.yaml b/.github/workflows/update_llm_perf_intel_openvino.yaml index c972a8869..8558b899e 100644 --- a/.github/workflows/update_llm_perf_intel_openvino.yaml +++ b/.github/workflows/update_llm_perf_intel_openvino.yaml @@ -1,4 +1,4 @@ -name: Update LLM Perf Benchmarks - Intel PyTorch +name: Update LLM Perf Benchmarks - Intel Openvino on: workflow_dispatch: From 2f54e2dde6c071e68819be5680cf307bd11a2f6f Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 5 Sep 2024 08:36:13 +0000 Subject: [PATCH 32/73] add new workflow --- llm_perf/update_llm_perf_leaderboard.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index 0f4964156..e8308d59b 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -46,7 +46,9 @@ def update_perf_dfs(): try: gather_benchmarks(subset, hardware_config.machine, backend, hardware_config.hardware_type) except Exception as e: - print(f"Error gathering benchmarks for {hardware_config.machine} with 
{hardware_config.hardware_type} and {subset} with {backend}: {e}") + print( + f"Error gathering benchmarks for {hardware_config.machine} with {hardware_config.hardware_type} and {subset} with {backend}: {e}" + ) scrapping_script = """ From 8603b682a88d0b080ccb97599eb8edb4919b30f0 Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 5 Sep 2024 08:38:18 +0000 Subject: [PATCH 33/73] add new workflow --- .../update_llm_perf_intel_openvino.yaml | 56 ------------------- 1 file changed, 56 deletions(-) delete mode 100644 .github/workflows/update_llm_perf_intel_openvino.yaml diff --git a/.github/workflows/update_llm_perf_intel_openvino.yaml b/.github/workflows/update_llm_perf_intel_openvino.yaml deleted file mode 100644 index 8558b899e..000000000 --- a/.github/workflows/update_llm_perf_intel_openvino.yaml +++ /dev/null @@ -1,56 +0,0 @@ -name: Update LLM Perf Benchmarks - Intel Openvino - -on: - workflow_dispatch: - schedule: - - cron: "0 0 * * *" - -concurrency: - cancel-in-progress: true - group: ${{ github.workflow }}-${{ github.ref }} - -env: - IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cpu - -jobs: - run_benchmarks: - strategy: - fail-fast: false - matrix: - backend: [pytorch, onnxruntime, openvino] - subset: [unquantized] - machine: [ - {name: c7i, runs-on: {group: 'aws-c7i-8xlarge-plus'}}, - ] - - runs-on: ${{ matrix.machine.runs-on }} - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Run benchmarks - uses: addnab/docker-run-action@v3 - env: - BACKEND: ${{ matrix.backend }} - SUBSET: ${{ matrix.subset }} - MACHINE: ${{ matrix.machine.name }} - HF_TOKEN: ${{ secrets.HF_TOKEN }} - with: - image: ${{ env.IMAGE }} - options: | - --rm - --shm-size 64G - --env BACKEND - --env SUBSET - --env MACHINE - --env HF_TOKEN - --env MKL_THREADING_LAYER=GNU - --env HF_HUB_ENABLE_HF_TRANSFER=1 - --volume ${{ github.workspace }}:/workspace - --workdir /workspace - run: | - pip install packaging && pip install einops scipy optimum codecarbon - pip install -U transformers huggingface_hub[hf_transfer] - pip install -e .[onnxruntime,openvino] - python llm_perf/hardware/intel/update_llm_perf_intel.py From ec829cb70765433a61336ee46634ae866ed0cde6 Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 5 Sep 2024 08:38:36 +0000 Subject: [PATCH 34/73] add new workflow --- .../update_llm_perf_intel_pytorch.yaml | 56 ------------------- 1 file changed, 56 deletions(-) delete mode 100644 .github/workflows/update_llm_perf_intel_pytorch.yaml diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yaml b/.github/workflows/update_llm_perf_intel_pytorch.yaml deleted file mode 100644 index c972a8869..000000000 --- a/.github/workflows/update_llm_perf_intel_pytorch.yaml +++ /dev/null @@ -1,56 +0,0 @@ -name: Update LLM Perf Benchmarks - Intel PyTorch - -on: - workflow_dispatch: - schedule: - - cron: "0 0 * * *" - -concurrency: - cancel-in-progress: true - group: ${{ github.workflow }}-${{ github.ref }} - -env: - IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cpu - -jobs: - run_benchmarks: - strategy: - fail-fast: false - matrix: - backend: [pytorch, onnxruntime, openvino] - subset: [unquantized] - machine: [ - {name: c7i, runs-on: {group: 'aws-c7i-8xlarge-plus'}}, - ] - - runs-on: ${{ matrix.machine.runs-on }} - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Run benchmarks - uses: addnab/docker-run-action@v3 - env: - BACKEND: ${{ matrix.backend }} - SUBSET: ${{ matrix.subset }} - MACHINE: ${{ matrix.machine.name }} - HF_TOKEN: ${{ secrets.HF_TOKEN }} - with: - image: ${{ 
env.IMAGE }} - options: | - --rm - --shm-size 64G - --env BACKEND - --env SUBSET - --env MACHINE - --env HF_TOKEN - --env MKL_THREADING_LAYER=GNU - --env HF_HUB_ENABLE_HF_TRANSFER=1 - --volume ${{ github.workspace }}:/workspace - --workdir /workspace - run: | - pip install packaging && pip install einops scipy optimum codecarbon - pip install -U transformers huggingface_hub[hf_transfer] - pip install -e .[onnxruntime,openvino] - python llm_perf/hardware/intel/update_llm_perf_intel.py From d152c8130e0f36acab878caa1dd0e77264fd97d7 Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 5 Sep 2024 08:45:17 +0000 Subject: [PATCH 35/73] add new workflow --- .../update_llm_perf_intel_openvino.yaml | 56 +++++++++++++++++++ .../update_llm_perf_intel_pytorch.yml | 56 +++++++++++++++++++ 2 files changed, 112 insertions(+) create mode 100644 .github/workflows/update_llm_perf_intel_openvino.yaml create mode 100644 .github/workflows/update_llm_perf_intel_pytorch.yml diff --git a/.github/workflows/update_llm_perf_intel_openvino.yaml b/.github/workflows/update_llm_perf_intel_openvino.yaml new file mode 100644 index 000000000..8558b899e --- /dev/null +++ b/.github/workflows/update_llm_perf_intel_openvino.yaml @@ -0,0 +1,56 @@ +name: Update LLM Perf Benchmarks - Intel Openvino + +on: + workflow_dispatch: + schedule: + - cron: "0 0 * * *" + +concurrency: + cancel-in-progress: true + group: ${{ github.workflow }}-${{ github.ref }} + +env: + IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cpu + +jobs: + run_benchmarks: + strategy: + fail-fast: false + matrix: + backend: [pytorch, onnxruntime, openvino] + subset: [unquantized] + machine: [ + {name: c7i, runs-on: {group: 'aws-c7i-8xlarge-plus'}}, + ] + + runs-on: ${{ matrix.machine.runs-on }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Run benchmarks + uses: addnab/docker-run-action@v3 + env: + BACKEND: ${{ matrix.backend }} + SUBSET: ${{ matrix.subset }} + MACHINE: ${{ matrix.machine.name }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} + with: + image: ${{ env.IMAGE }} + options: | + --rm + --shm-size 64G + --env BACKEND + --env SUBSET + --env MACHINE + --env HF_TOKEN + --env MKL_THREADING_LAYER=GNU + --env HF_HUB_ENABLE_HF_TRANSFER=1 + --volume ${{ github.workspace }}:/workspace + --workdir /workspace + run: | + pip install packaging && pip install einops scipy optimum codecarbon + pip install -U transformers huggingface_hub[hf_transfer] + pip install -e .[onnxruntime,openvino] + python llm_perf/hardware/intel/update_llm_perf_intel.py diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml b/.github/workflows/update_llm_perf_intel_pytorch.yml new file mode 100644 index 000000000..c972a8869 --- /dev/null +++ b/.github/workflows/update_llm_perf_intel_pytorch.yml @@ -0,0 +1,56 @@ +name: Update LLM Perf Benchmarks - Intel PyTorch + +on: + workflow_dispatch: + schedule: + - cron: "0 0 * * *" + +concurrency: + cancel-in-progress: true + group: ${{ github.workflow }}-${{ github.ref }} + +env: + IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cpu + +jobs: + run_benchmarks: + strategy: + fail-fast: false + matrix: + backend: [pytorch, onnxruntime, openvino] + subset: [unquantized] + machine: [ + {name: c7i, runs-on: {group: 'aws-c7i-8xlarge-plus'}}, + ] + + runs-on: ${{ matrix.machine.runs-on }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Run benchmarks + uses: addnab/docker-run-action@v3 + env: + BACKEND: ${{ matrix.backend }} + SUBSET: ${{ matrix.subset }} + MACHINE: ${{ matrix.machine.name }} + HF_TOKEN: ${{ 
secrets.HF_TOKEN }} + with: + image: ${{ env.IMAGE }} + options: | + --rm + --shm-size 64G + --env BACKEND + --env SUBSET + --env MACHINE + --env HF_TOKEN + --env MKL_THREADING_LAYER=GNU + --env HF_HUB_ENABLE_HF_TRANSFER=1 + --volume ${{ github.workspace }}:/workspace + --workdir /workspace + run: | + pip install packaging && pip install einops scipy optimum codecarbon + pip install -U transformers huggingface_hub[hf_transfer] + pip install -e .[onnxruntime,openvino] + python llm_perf/hardware/intel/update_llm_perf_intel.py From 9730b0bd398eeaf203b9af46e851199527f861ce Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 5 Sep 2024 09:05:41 +0000 Subject: [PATCH 36/73] add new workflow --- .github/workflows/update_llm_perf_intel_openvino.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/update_llm_perf_intel_openvino.yaml b/.github/workflows/update_llm_perf_intel_openvino.yaml index 8558b899e..943d16d8f 100644 --- a/.github/workflows/update_llm_perf_intel_openvino.yaml +++ b/.github/workflows/update_llm_perf_intel_openvino.yaml @@ -2,6 +2,7 @@ name: Update LLM Perf Benchmarks - Intel Openvino on: workflow_dispatch: + push: schedule: - cron: "0 0 * * *" From 6e9d33c940ad88322457bf240d826b638b554ba7 Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 5 Sep 2024 10:03:13 +0000 Subject: [PATCH 37/73] add new workflow --- .../update_llm_perf_cuda_pytorch.yaml | 2 +- .../update_llm_perf_intel_openvino.yaml | 57 ----- .../update_llm_perf_intel_pytorch.yml | 56 ----- llm_perf/hardware.yml | 12 +- .../hardware/update_llm_perf_cuda_pytorch.py | 182 --------------- llm_perf/hardware/update_llm_perf_intel.py | 219 ------------------ llm_perf/hardware/utils.py | 24 -- llm_perf/update_llm_perf_leaderboard.py | 2 +- llm_perf/utils.py | 32 +++ 9 files changed, 42 insertions(+), 544 deletions(-) delete mode 100644 .github/workflows/update_llm_perf_intel_openvino.yaml delete mode 100644 .github/workflows/update_llm_perf_intel_pytorch.yml delete mode 100644 llm_perf/hardware/update_llm_perf_cuda_pytorch.py delete mode 100644 llm_perf/hardware/update_llm_perf_intel.py delete mode 100644 llm_perf/hardware/utils.py diff --git a/.github/workflows/update_llm_perf_cuda_pytorch.yaml b/.github/workflows/update_llm_perf_cuda_pytorch.yaml index 567128e5c..2597d7389 100644 --- a/.github/workflows/update_llm_perf_cuda_pytorch.yaml +++ b/.github/workflows/update_llm_perf_cuda_pytorch.yaml @@ -53,4 +53,4 @@ jobs: pip install packaging && pip install flash-attn einops scipy auto-gptq optimum bitsandbytes autoawq codecarbon pip install -U transformers huggingface_hub[hf_transfer] pip install -e . 
- python llm_perf/hardware/cuda/update_llm_perf_cuda_pytorch.py + python llm_perf/hardware_provider/nvidia/update_llm_perf_cuda_pytorch.py diff --git a/.github/workflows/update_llm_perf_intel_openvino.yaml b/.github/workflows/update_llm_perf_intel_openvino.yaml deleted file mode 100644 index 943d16d8f..000000000 --- a/.github/workflows/update_llm_perf_intel_openvino.yaml +++ /dev/null @@ -1,57 +0,0 @@ -name: Update LLM Perf Benchmarks - Intel Openvino - -on: - workflow_dispatch: - push: - schedule: - - cron: "0 0 * * *" - -concurrency: - cancel-in-progress: true - group: ${{ github.workflow }}-${{ github.ref }} - -env: - IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cpu - -jobs: - run_benchmarks: - strategy: - fail-fast: false - matrix: - backend: [pytorch, onnxruntime, openvino] - subset: [unquantized] - machine: [ - {name: c7i, runs-on: {group: 'aws-c7i-8xlarge-plus'}}, - ] - - runs-on: ${{ matrix.machine.runs-on }} - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Run benchmarks - uses: addnab/docker-run-action@v3 - env: - BACKEND: ${{ matrix.backend }} - SUBSET: ${{ matrix.subset }} - MACHINE: ${{ matrix.machine.name }} - HF_TOKEN: ${{ secrets.HF_TOKEN }} - with: - image: ${{ env.IMAGE }} - options: | - --rm - --shm-size 64G - --env BACKEND - --env SUBSET - --env MACHINE - --env HF_TOKEN - --env MKL_THREADING_LAYER=GNU - --env HF_HUB_ENABLE_HF_TRANSFER=1 - --volume ${{ github.workspace }}:/workspace - --workdir /workspace - run: | - pip install packaging && pip install einops scipy optimum codecarbon - pip install -U transformers huggingface_hub[hf_transfer] - pip install -e .[onnxruntime,openvino] - python llm_perf/hardware/intel/update_llm_perf_intel.py diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yml b/.github/workflows/update_llm_perf_intel_pytorch.yml deleted file mode 100644 index c972a8869..000000000 --- a/.github/workflows/update_llm_perf_intel_pytorch.yml +++ /dev/null @@ -1,56 +0,0 @@ -name: Update LLM Perf Benchmarks - Intel PyTorch - -on: - workflow_dispatch: - schedule: - - cron: "0 0 * * *" - -concurrency: - cancel-in-progress: true - group: ${{ github.workflow }}-${{ github.ref }} - -env: - IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cpu - -jobs: - run_benchmarks: - strategy: - fail-fast: false - matrix: - backend: [pytorch, onnxruntime, openvino] - subset: [unquantized] - machine: [ - {name: c7i, runs-on: {group: 'aws-c7i-8xlarge-plus'}}, - ] - - runs-on: ${{ matrix.machine.runs-on }} - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Run benchmarks - uses: addnab/docker-run-action@v3 - env: - BACKEND: ${{ matrix.backend }} - SUBSET: ${{ matrix.subset }} - MACHINE: ${{ matrix.machine.name }} - HF_TOKEN: ${{ secrets.HF_TOKEN }} - with: - image: ${{ env.IMAGE }} - options: | - --rm - --shm-size 64G - --env BACKEND - --env SUBSET - --env MACHINE - --env HF_TOKEN - --env MKL_THREADING_LAYER=GNU - --env HF_HUB_ENABLE_HF_TRANSFER=1 - --volume ${{ github.workspace }}:/workspace - --workdir /workspace - run: | - pip install packaging && pip install einops scipy optimum codecarbon - pip install -U transformers huggingface_hub[hf_transfer] - pip install -e .[onnxruntime,openvino] - python llm_perf/hardware/intel/update_llm_perf_intel.py diff --git a/llm_perf/hardware.yml b/llm_perf/hardware.yml index d4a6bfb56..ac7f85b2d 100644 --- a/llm_perf/hardware.yml +++ b/llm_perf/hardware.yml @@ -1,6 +1,7 @@ - machine: 1xA10 description: A10-24GB-150W 🖥️ - hardware_type: cuda + hardware_provider: nvidia + hardware_type: 
gpu subsets: - unquantized - awq @@ -11,7 +12,8 @@ - machine: 1xA100 description: A100-80GB-275W 🖥️ - hardware_type: cuda + hardware_provider: nvidia + hardware_type: gpu subsets: - unquantized - awq @@ -22,7 +24,8 @@ - machine: 1xT4 description: T4-16GB-70W 🖥️ - hardware_type: cuda + hardware_provider: nvidia + hardware_type: gpu subsets: - unquantized - awq @@ -33,7 +36,8 @@ - machine: c7i description: 4th-Gen-Intel-Xeon-385W 🖥️ - hardware_type: intel + hardware_provider: intel + hardware_type: cpu subsets: - unquantized backends: diff --git a/llm_perf/hardware/update_llm_perf_cuda_pytorch.py b/llm_perf/hardware/update_llm_perf_cuda_pytorch.py deleted file mode 100644 index 8b65e1f5c..000000000 --- a/llm_perf/hardware/update_llm_perf_cuda_pytorch.py +++ /dev/null @@ -1,182 +0,0 @@ -import os -import traceback -from itertools import product -from logging import getLogger - -from llm_perf.utils import ( - CANONICAL_PRETRAINED_OPEN_LLM_LIST, - GENERATE_KWARGS, - INPUT_SHAPES, - OPEN_LLM_LIST, - PRETRAINED_OPEN_LLM_LIST, - is_benchmark_conducted, - is_benchmark_supported, -) -from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport, InferenceConfig, ProcessConfig, PyTorchConfig -from optimum_benchmark.logging_utils import setup_logging - -SUBSET = os.getenv("SUBSET", None) -MACHINE = os.getenv("MACHINE", None) -HARDWARE = "cuda" - - -if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: - PUSH_REPO_ID = "optimum-benchmark/llm-perf-pytorch-cuda-debug" - CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] - SUBSET = "unquantized" -elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-pytorch-cuda-{SUBSET}-{MACHINE}" -else: - raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") - -ATTENTION_CONFIGS = ["eager", "sdpa", "flash_attention_2"] -if SUBSET == "unquantized": - WEIGHTS_CONFIGS = { - # unquantized - "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, - "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, - "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, - } -elif SUBSET == "bnb": - WEIGHTS_CONFIGS = { - # bnb - "4bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_4bit": True}}, - "8bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_8bit": True}}, - } -elif SUBSET == "gptq": - WEIGHTS_CONFIGS = { - # gptq - "4bit-gptq-exllama-v1": { - "quant_scheme": "gptq", - "torch_dtype": "float16", - "quant_config": {"bits": 4, "use_exllama ": True, "version": 1, "model_seqlen": 256}, - }, - "4bit-gptq-exllama-v2": { - "torch_dtype": "float16", - "quant_scheme": "gptq", - "quant_config": {"bits": 4, "use_exllama ": True, "version": 2, "model_seqlen": 256}, - }, - } -elif SUBSET == "awq": - WEIGHTS_CONFIGS = { - # awq - "4bit-awq-gemm": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": {"bits": 4, "version": "gemm"}, - }, - "4bit-awq-gemv": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": {"bits": 4, "version": "gemv"}, - }, - "4bit-awq-exllama-v1": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": { - "bits": 4, - "version": "exllama", - "exllama_config": {"version": 1, "max_input_len": 64, "max_batch_size": 1}, - }, - }, - "4bit-awq-exllama-v2": { - "torch_dtype": "float16", - "quant_scheme": "awq", - 
"quant_config": { - "bits": 4, - "version": "exllama", - "exllama_config": {"version": 2, "max_input_len": 64, "max_batch_size": 1}, - }, - }, - } - - -LOGGER = getLogger("llm-perf-backend") -LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") -LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") -LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") - - -def benchmark_cuda_pytorch(model, attn_implementation, weights_config): - benchmark_name = f"{weights_config}-{attn_implementation}" - subfolder = f"{benchmark_name}/{model.replace('/', '--')}" - - torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] - quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] - quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] - - if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") - return - - if is_benchmark_conducted(PUSH_REPO_ID, subfolder): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") - return - - launcher_config = ProcessConfig(device_isolation=True, device_isolation_action="kill") - scenario_config = InferenceConfig( - memory=True, - energy=True, - latency=True, - duration=10, - iterations=10, - warmup_runs=10, - input_shapes=INPUT_SHAPES, - generate_kwargs=GENERATE_KWARGS, - ) - backend_config = PyTorchConfig( - model=model, - device="cuda", - device_ids="0", - no_weights=True, - library="transformers", - task="text-generation", - torch_dtype=torch_dtype, - quantization_scheme=quant_scheme, - quantization_config=quant_config, - attn_implementation=attn_implementation, - model_kwargs={"trust_remote_code": True}, - ) - - benchmark_config = BenchmarkConfig( - name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config - ) - - benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - try: - LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") - benchmark_report = Benchmark.launch(benchmark_config) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - except Exception: - LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") - benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - -if __name__ == "__main__": - # for isolated process - os.environ["LOG_TO_FILE"] = "0" - os.environ["LOG_LEVEL"] = "INFO" - - # for main process - setup_logging(level="INFO", prefix="MAIN-PROCESS") - - models_attentions_weights = list( - product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) - ) - - LOGGER.info( - f"Running a total of {len(models_attentions_weights)} benchmarks, " - f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " - f"{len(ATTENTION_CONFIGS)} attentions implementations " - f"and {len(WEIGHTS_CONFIGS)} weights configurations." 
- ) - - for model, attn_implementation, weights_config in models_attentions_weights: - benchmark_cuda_pytorch(model, attn_implementation, weights_config) diff --git a/llm_perf/hardware/update_llm_perf_intel.py b/llm_perf/hardware/update_llm_perf_intel.py deleted file mode 100644 index 5c760e1bd..000000000 --- a/llm_perf/hardware/update_llm_perf_intel.py +++ /dev/null @@ -1,219 +0,0 @@ -import os -import traceback -from itertools import product -from logging import getLogger - -from llm_perf.utils import ( - CANONICAL_PRETRAINED_OPEN_LLM_LIST, - GENERATE_KWARGS, - INPUT_SHAPES, - OPEN_LLM_LIST, - PRETRAINED_OPEN_LLM_LIST, - is_benchmark_conducted, - is_benchmark_supported, -) -from optimum_benchmark import ( - Benchmark, - BenchmarkConfig, - BenchmarkReport, - InferenceConfig, - ORTConfig, - OVConfig, - ProcessConfig, - PyTorchConfig, -) -from optimum_benchmark.logging_utils import setup_logging - -SUBSET = os.getenv("SUBSET", None) -MACHINE = os.getenv("MACHINE", None) -BACKEND = os.getenv("BACKEND", None) -HARDWARE = "intel" - -if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-intel-debug" - CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] - SUBSET = "unquantized" -elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-intel-{SUBSET}-{MACHINE}" -else: - raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") - -ATTENTION_CONFIGS = ["eager", "sdpa"] - - -if SUBSET == "unquantized": - WEIGHTS_CONFIGS = { - # unquantized - "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, - "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, - "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, - } -elif SUBSET == "bnb": - WEIGHTS_CONFIGS = { - # bnb - "4bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_4bit": True}}, - "8bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_8bit": True}}, - } -elif SUBSET == "gptq": - WEIGHTS_CONFIGS = { - # gptq - "4bit-gptq-exllama-v1": { - "quant_scheme": "gptq", - "torch_dtype": "float16", - "quant_config": {"bits": 4, "use_exllama ": True, "version": 1, "model_seqlen": 256}, - }, - "4bit-gptq-exllama-v2": { - "torch_dtype": "float16", - "quant_scheme": "gptq", - "quant_config": {"bits": 4, "use_exllama ": True, "version": 2, "model_seqlen": 256}, - }, - } -elif SUBSET == "awq": - WEIGHTS_CONFIGS = { - # awq - "4bit-awq-gemm": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": {"bits": 4, "version": "gemm"}, - }, - "4bit-awq-gemv": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": {"bits": 4, "version": "gemv"}, - }, - "4bit-awq-exllama-v1": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": { - "bits": 4, - "version": "exllama", - "exllama_config": {"version": 1, "max_input_len": 64, "max_batch_size": 1}, - }, - }, - "4bit-awq-exllama-v2": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": { - "bits": 4, - "version": "exllama", - "exllama_config": {"version": 2, "max_input_len": 64, "max_batch_size": 1}, - }, - }, - } - - -LOGGER = getLogger("llm-perf-backend") -LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") -LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") 
-LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") - - -def benchmark_intel(model, attn_implementation, weights_config): - benchmark_name = f"{weights_config}-{attn_implementation}-{BACKEND}" - subfolder = f"{benchmark_name}/{model.replace('/', '--')}" - - torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] - quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] - quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] - - if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") - return - - if is_benchmark_conducted(PUSH_REPO_ID, subfolder): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") - return - - launcher_config = ProcessConfig() - scenario_config = InferenceConfig( - memory=True, - energy=True, - latency=True, - duration=10, - iterations=10, - warmup_runs=10, - input_shapes=INPUT_SHAPES, - generate_kwargs=GENERATE_KWARGS, - ) - - if BACKEND == "pytorch": - backend_config = PyTorchConfig( - model=model, - device="cpu", - no_weights=True, - library="transformers", - task="text-generation", - torch_dtype=torch_dtype, - quantization_scheme=quant_scheme, - quantization_config=quant_config, - attn_implementation=attn_implementation, - model_kwargs={"trust_remote_code": True}, - ) - elif BACKEND == "onnxruntime": - backend_config = ORTConfig( - model=model, - device="cpu", - device_ids="0", - no_weights=True, - library="transformers", - task="text-generation", - torch_dtype=torch_dtype, - quantization_config=quant_config, - model_kwargs={"trust_remote_code": True}, - ) - elif BACKEND == "openvino": - backend_config = OVConfig( - model=model, - device="cpu", - device_ids="0", - no_weights=True, - library="transformers", - task="text-generation", - quantization_config=quant_config, - model_kwargs={"trust_remote_code": True}, - ) - else: - raise ValueError(f"Unsupported backend: {BACKEND}") - - benchmark_config = BenchmarkConfig( - name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config - ) - - benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - try: - LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") - benchmark_report = Benchmark.launch(benchmark_config) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - except Exception: - LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") - benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - -if __name__ == "__main__": - # for isolated process - os.environ["LOG_TO_FILE"] = "0" - os.environ["LOG_LEVEL"] = "INFO" - - # for main process - setup_logging(level="INFO", prefix="MAIN-PROCESS") - - models_attentions_weights = list( - product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) - ) - - LOGGER.info( - f"Running a total of {len(models_attentions_weights)} benchmarks, " - f"with 
{len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " - f"{len(ATTENTION_CONFIGS)} attentions implementations " - f"and {len(WEIGHTS_CONFIGS)} weights configurations." - ) - - for model, attn_implementation, weights_config in models_attentions_weights: - benchmark_intel(model, attn_implementation, weights_config) diff --git a/llm_perf/hardware/utils.py b/llm_perf/hardware/utils.py deleted file mode 100644 index 71c9ce706..000000000 --- a/llm_perf/hardware/utils.py +++ /dev/null @@ -1,24 +0,0 @@ -from typing import Any, Dict, List - -import yaml - - -class HardwareConfig: - def __init__(self, data: Dict[str, Any]): - self.machine = data["machine"] - self.description = data["description"] - self.hardware_type = data["hardware_type"] - self.subsets = data["subsets"] - self.backends = data["backends"] - - def __repr__(self): - return ( - f"HardwareConfig(machine='{self.machine}', description='{self.description}', " - f"hardware_type={self.hardware_type}, subsets={self.subsets}, backends={self.backends})" - ) - - -def load_hardware_configs(file_path: str) -> List[HardwareConfig]: - with open(file_path, "r") as file: - data = yaml.safe_load(file) - return [HardwareConfig(config) for config in data] diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index e8308d59b..caaf91ec2 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -2,7 +2,7 @@ from glob import glob import pandas as pd -from hardware.utils import load_hardware_configs +from utils import load_hardware_configs from huggingface_hub import create_repo, snapshot_download, upload_file from tqdm import tqdm diff --git a/llm_perf/utils.py b/llm_perf/utils.py index ea25cdab3..8a5c43a83 100644 --- a/llm_perf/utils.py +++ b/llm_perf/utils.py @@ -1,4 +1,8 @@ +from enum import Enum, auto +from typing import Any, Dict, List + import pandas as pd +import yaml from optimum_benchmark.benchmark.report import BenchmarkReport @@ -146,3 +150,31 @@ def is_benchmark_supported(weights_config, attn_implementation, hardware): return False return True + + +class HardwareType(Enum): + CPU = auto() + GPU = auto() + + +class HardwareConfig: + def __init__(self, data: Dict[str, Any]): + self.machine = data["machine"] + self.description = data["description"] + self.hardware_provider = data["hardware provider"] + self.hardware_type = data["hardware type"] + assert self.hardware_type in HardwareType, f"Hardware type {self.hardware_type} not supported" + self.subsets = data["subsets"] + self.backends = data["backends"] + + def __repr__(self): + return ( + f"HardwareConfig(machine='{self.machine}', description='{self.description}', " + f"hardware_type={self.hardware_type}, subsets={self.subsets}, backends={self.backends})" + ) + + +def load_hardware_configs(file_path: str) -> List[HardwareConfig]: + with open(file_path, "r") as file: + data = yaml.safe_load(file) + return [HardwareConfig(config) for config in data] From 540af0a0e82788ab80d930c997034fcc13cc30b5 Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 5 Sep 2024 10:04:55 +0000 Subject: [PATCH 38/73] add new workflow --- llm_perf/update_llm_perf_leaderboard.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index caaf91ec2..af224c06d 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -2,9 +2,9 @@ from glob import glob import pandas as pd -from utils import 
load_hardware_configs from huggingface_hub import create_repo, snapshot_download, upload_file from tqdm import tqdm +from utils import load_hardware_configs from optimum_benchmark import Benchmark From 452e4b0b365b7480fb8becfaffc6a9f7bce7c814 Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 5 Sep 2024 10:06:53 +0000 Subject: [PATCH 39/73] add new workflow --- .../update_llm_perf_intel_pytorch.yaml | 54 ++++++ .../intel/update_llm_perf_intel_pytorch.py | 141 ++++++++++++++ .../nvidia/update_llm_perf_cuda_pytorch.py | 182 ++++++++++++++++++ 3 files changed, 377 insertions(+) create mode 100644 .github/workflows/update_llm_perf_intel_pytorch.yaml create mode 100644 llm_perf/hardware_provider/intel/update_llm_perf_intel_pytorch.py create mode 100644 llm_perf/hardware_provider/nvidia/update_llm_perf_cuda_pytorch.py diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yaml b/.github/workflows/update_llm_perf_intel_pytorch.yaml new file mode 100644 index 000000000..3cf6bd9bd --- /dev/null +++ b/.github/workflows/update_llm_perf_intel_pytorch.yaml @@ -0,0 +1,54 @@ +name: Update LLM Perf Benchmarks - Intel PyTorch + +on: + workflow_dispatch: + push: + schedule: + - cron: "0 0 * * *" + +concurrency: + cancel-in-progress: true + group: ${{ github.workflow }}-${{ github.ref }} + +env: + IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cpu + +jobs: + run_benchmarks: + strategy: + fail-fast: false + matrix: + subset: [unquantized] + machine: [ + {name: 32vCPU-C7i, runs-on: {group: 'aws-c7i-8xlarge-plus'}}, + ] + + runs-on: ${{ matrix.machine.runs-on }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Run benchmarks + uses: addnab/docker-run-action@v3 + env: + SUBSET: ${{ matrix.subset }} + MACHINE: ${{ matrix.machine.name }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} + with: + image: ${{ env.IMAGE }} + options: | + --rm + --shm-size 64G + --env SUBSET + --env MACHINE + --env HF_TOKEN + --env MKL_THREADING_LAYER=GNU + --env HF_HUB_ENABLE_HF_TRANSFER=1 + --volume ${{ github.workspace }}:/workspace + --workdir /workspace + run: | + pip install packaging && pip install einops scipy optimum codecarbon + pip install -U transformers huggingface_hub[hf_transfer] + pip install -e .[onnxruntime,openvino] + python llm_perf/hardware_provider/intel/update_llm_perf_intel_pytorch.py diff --git a/llm_perf/hardware_provider/intel/update_llm_perf_intel_pytorch.py b/llm_perf/hardware_provider/intel/update_llm_perf_intel_pytorch.py new file mode 100644 index 000000000..3b7dfe781 --- /dev/null +++ b/llm_perf/hardware_provider/intel/update_llm_perf_intel_pytorch.py @@ -0,0 +1,141 @@ +import os +import traceback +from itertools import product +from logging import getLogger + +from llm_perf.utils import ( + CANONICAL_PRETRAINED_OPEN_LLM_LIST, + GENERATE_KWARGS, + INPUT_SHAPES, + OPEN_LLM_LIST, + PRETRAINED_OPEN_LLM_LIST, + is_benchmark_conducted, + is_benchmark_supported, +) +from optimum_benchmark import ( + Benchmark, + BenchmarkConfig, + BenchmarkReport, + InferenceConfig, + ProcessConfig, + PyTorchConfig, +) +from optimum_benchmark.logging_utils import setup_logging + +SUBSET = os.getenv("SUBSET", None) +MACHINE = os.getenv("MACHINE", None) +BACKEND = "pytorch" +HARDWARE = "intel" + +if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: + PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-intel-debug" + CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] + SUBSET = "unquantized" +elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: + 
PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-intel-{SUBSET}-{MACHINE}" +else: + raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") + +ATTENTION_CONFIGS = ["eager", "sdpa"] + + +if SUBSET == "unquantized": + WEIGHTS_CONFIGS = { + # unquantized + "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, + "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, + "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, + } +else: + raise ValueError(f"Subset {SUBSET} not supported") + + +LOGGER = getLogger("llm-perf-backend") +LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") +LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") +LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") + + +def benchmark_intel(model, attn_implementation, weights_config): + benchmark_name = f"{weights_config}-{attn_implementation}-{BACKEND}" + subfolder = f"{benchmark_name}/{model.replace('/', '--')}" + + torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] + quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] + quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] + + if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): + LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") + return + + if is_benchmark_conducted(PUSH_REPO_ID, subfolder): + LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") + return + + launcher_config = ProcessConfig() + scenario_config = InferenceConfig( + memory=True, + energy=True, + latency=True, + duration=10, + iterations=10, + warmup_runs=10, + input_shapes=INPUT_SHAPES, + generate_kwargs=GENERATE_KWARGS, + ) + + backend_config = PyTorchConfig( + model=model, + device="cpu", + no_weights=True, + library="transformers", + task="text-generation", + torch_dtype=torch_dtype, + quantization_scheme=quant_scheme, + quantization_config=quant_config, + attn_implementation=attn_implementation, + model_kwargs={"trust_remote_code": True}, + ) + + benchmark_config = BenchmarkConfig( + name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config + ) + + benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + try: + LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") + benchmark_report = Benchmark.launch(benchmark_config) + benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + except Exception: + LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") + benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) + benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + +if __name__ == "__main__": + # for isolated process + os.environ["LOG_TO_FILE"] = "0" + os.environ["LOG_LEVEL"] = "INFO" + + # for main process + setup_logging(level="INFO", prefix="MAIN-PROCESS") + + models_attentions_weights = list( + product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, 
ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) + ) + + LOGGER.info( + f"Running a total of {len(models_attentions_weights)} benchmarks, " + f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " + f"{len(ATTENTION_CONFIGS)} attentions implementations " + f"and {len(WEIGHTS_CONFIGS)} weights configurations." + ) + + for model, attn_implementation, weights_config in models_attentions_weights: + benchmark_intel(model, attn_implementation, weights_config) diff --git a/llm_perf/hardware_provider/nvidia/update_llm_perf_cuda_pytorch.py b/llm_perf/hardware_provider/nvidia/update_llm_perf_cuda_pytorch.py new file mode 100644 index 000000000..8b65e1f5c --- /dev/null +++ b/llm_perf/hardware_provider/nvidia/update_llm_perf_cuda_pytorch.py @@ -0,0 +1,182 @@ +import os +import traceback +from itertools import product +from logging import getLogger + +from llm_perf.utils import ( + CANONICAL_PRETRAINED_OPEN_LLM_LIST, + GENERATE_KWARGS, + INPUT_SHAPES, + OPEN_LLM_LIST, + PRETRAINED_OPEN_LLM_LIST, + is_benchmark_conducted, + is_benchmark_supported, +) +from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport, InferenceConfig, ProcessConfig, PyTorchConfig +from optimum_benchmark.logging_utils import setup_logging + +SUBSET = os.getenv("SUBSET", None) +MACHINE = os.getenv("MACHINE", None) +HARDWARE = "cuda" + + +if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: + PUSH_REPO_ID = "optimum-benchmark/llm-perf-pytorch-cuda-debug" + CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] + SUBSET = "unquantized" +elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: + PUSH_REPO_ID = f"optimum-benchmark/llm-perf-pytorch-cuda-{SUBSET}-{MACHINE}" +else: + raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") + +ATTENTION_CONFIGS = ["eager", "sdpa", "flash_attention_2"] +if SUBSET == "unquantized": + WEIGHTS_CONFIGS = { + # unquantized + "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, + "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, + "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, + } +elif SUBSET == "bnb": + WEIGHTS_CONFIGS = { + # bnb + "4bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_4bit": True}}, + "8bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_8bit": True}}, + } +elif SUBSET == "gptq": + WEIGHTS_CONFIGS = { + # gptq + "4bit-gptq-exllama-v1": { + "quant_scheme": "gptq", + "torch_dtype": "float16", + "quant_config": {"bits": 4, "use_exllama ": True, "version": 1, "model_seqlen": 256}, + }, + "4bit-gptq-exllama-v2": { + "torch_dtype": "float16", + "quant_scheme": "gptq", + "quant_config": {"bits": 4, "use_exllama ": True, "version": 2, "model_seqlen": 256}, + }, + } +elif SUBSET == "awq": + WEIGHTS_CONFIGS = { + # awq + "4bit-awq-gemm": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": {"bits": 4, "version": "gemm"}, + }, + "4bit-awq-gemv": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": {"bits": 4, "version": "gemv"}, + }, + "4bit-awq-exllama-v1": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": { + "bits": 4, + "version": "exllama", + "exllama_config": {"version": 1, "max_input_len": 64, "max_batch_size": 1}, + }, + }, + "4bit-awq-exllama-v2": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": { + "bits": 4, + "version": 
"exllama", + "exllama_config": {"version": 2, "max_input_len": 64, "max_batch_size": 1}, + }, + }, + } + + +LOGGER = getLogger("llm-perf-backend") +LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") +LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") +LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") + + +def benchmark_cuda_pytorch(model, attn_implementation, weights_config): + benchmark_name = f"{weights_config}-{attn_implementation}" + subfolder = f"{benchmark_name}/{model.replace('/', '--')}" + + torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] + quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] + quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] + + if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): + LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") + return + + if is_benchmark_conducted(PUSH_REPO_ID, subfolder): + LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") + return + + launcher_config = ProcessConfig(device_isolation=True, device_isolation_action="kill") + scenario_config = InferenceConfig( + memory=True, + energy=True, + latency=True, + duration=10, + iterations=10, + warmup_runs=10, + input_shapes=INPUT_SHAPES, + generate_kwargs=GENERATE_KWARGS, + ) + backend_config = PyTorchConfig( + model=model, + device="cuda", + device_ids="0", + no_weights=True, + library="transformers", + task="text-generation", + torch_dtype=torch_dtype, + quantization_scheme=quant_scheme, + quantization_config=quant_config, + attn_implementation=attn_implementation, + model_kwargs={"trust_remote_code": True}, + ) + + benchmark_config = BenchmarkConfig( + name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config + ) + + benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + try: + LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") + benchmark_report = Benchmark.launch(benchmark_config) + benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + except Exception: + LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") + benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) + benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + +if __name__ == "__main__": + # for isolated process + os.environ["LOG_TO_FILE"] = "0" + os.environ["LOG_LEVEL"] = "INFO" + + # for main process + setup_logging(level="INFO", prefix="MAIN-PROCESS") + + models_attentions_weights = list( + product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) + ) + + LOGGER.info( + f"Running a total of {len(models_attentions_weights)} benchmarks, " + f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " + f"{len(ATTENTION_CONFIGS)} attentions implementations " + f"and {len(WEIGHTS_CONFIGS)} weights configurations." 
+ ) + + for model, attn_implementation, weights_config in models_attentions_weights: + benchmark_cuda_pytorch(model, attn_implementation, weights_config) From b25d6e1196eb8e7668eaf150d5064bf0dfc380f7 Mon Sep 17 00:00:00 2001 From: baptiste Date: Fri, 6 Sep 2024 13:35:58 +0000 Subject: [PATCH 40/73] add new workflow --- .../update_llm_perf_cuda_pytorch.yaml | 2 +- .../update_llm_perf_intel_pytorch.yaml | 2 +- .../intel/update_llm_perf_intel_pytorch.py | 141 -------------- .../nvidia/update_llm_perf_cuda_pytorch.py | 182 ------------------ llm_perf/update_llm_perf_leaderboard.py | 15 +- 5 files changed, 12 insertions(+), 330 deletions(-) delete mode 100644 llm_perf/hardware_provider/intel/update_llm_perf_intel_pytorch.py delete mode 100644 llm_perf/hardware_provider/nvidia/update_llm_perf_cuda_pytorch.py diff --git a/.github/workflows/update_llm_perf_cuda_pytorch.yaml b/.github/workflows/update_llm_perf_cuda_pytorch.yaml index 2597d7389..78b2f1f2c 100644 --- a/.github/workflows/update_llm_perf_cuda_pytorch.yaml +++ b/.github/workflows/update_llm_perf_cuda_pytorch.yaml @@ -53,4 +53,4 @@ jobs: pip install packaging && pip install flash-attn einops scipy auto-gptq optimum bitsandbytes autoawq codecarbon pip install -U transformers huggingface_hub[hf_transfer] pip install -e . - python llm_perf/hardware_provider/nvidia/update_llm_perf_cuda_pytorch.py + python llm_perf/scripts/update_llm_perf_cuda_pytorch.py diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yaml b/.github/workflows/update_llm_perf_intel_pytorch.yaml index 3cf6bd9bd..9b44ab711 100644 --- a/.github/workflows/update_llm_perf_intel_pytorch.yaml +++ b/.github/workflows/update_llm_perf_intel_pytorch.yaml @@ -51,4 +51,4 @@ jobs: pip install packaging && pip install einops scipy optimum codecarbon pip install -U transformers huggingface_hub[hf_transfer] pip install -e .[onnxruntime,openvino] - python llm_perf/hardware_provider/intel/update_llm_perf_intel_pytorch.py + python llm_perf/scripts/update_llm_perf_cpu_pytorch.py diff --git a/llm_perf/hardware_provider/intel/update_llm_perf_intel_pytorch.py b/llm_perf/hardware_provider/intel/update_llm_perf_intel_pytorch.py deleted file mode 100644 index 3b7dfe781..000000000 --- a/llm_perf/hardware_provider/intel/update_llm_perf_intel_pytorch.py +++ /dev/null @@ -1,141 +0,0 @@ -import os -import traceback -from itertools import product -from logging import getLogger - -from llm_perf.utils import ( - CANONICAL_PRETRAINED_OPEN_LLM_LIST, - GENERATE_KWARGS, - INPUT_SHAPES, - OPEN_LLM_LIST, - PRETRAINED_OPEN_LLM_LIST, - is_benchmark_conducted, - is_benchmark_supported, -) -from optimum_benchmark import ( - Benchmark, - BenchmarkConfig, - BenchmarkReport, - InferenceConfig, - ProcessConfig, - PyTorchConfig, -) -from optimum_benchmark.logging_utils import setup_logging - -SUBSET = os.getenv("SUBSET", None) -MACHINE = os.getenv("MACHINE", None) -BACKEND = "pytorch" -HARDWARE = "intel" - -if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-intel-debug" - CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] - SUBSET = "unquantized" -elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-intel-{SUBSET}-{MACHINE}" -else: - raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") - -ATTENTION_CONFIGS = ["eager", "sdpa"] - - -if SUBSET == "unquantized": - WEIGHTS_CONFIGS = { - # unquantized 
- "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, - "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, - "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, - } -else: - raise ValueError(f"Subset {SUBSET} not supported") - - -LOGGER = getLogger("llm-perf-backend") -LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") -LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") -LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") - - -def benchmark_intel(model, attn_implementation, weights_config): - benchmark_name = f"{weights_config}-{attn_implementation}-{BACKEND}" - subfolder = f"{benchmark_name}/{model.replace('/', '--')}" - - torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] - quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] - quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] - - if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") - return - - if is_benchmark_conducted(PUSH_REPO_ID, subfolder): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") - return - - launcher_config = ProcessConfig() - scenario_config = InferenceConfig( - memory=True, - energy=True, - latency=True, - duration=10, - iterations=10, - warmup_runs=10, - input_shapes=INPUT_SHAPES, - generate_kwargs=GENERATE_KWARGS, - ) - - backend_config = PyTorchConfig( - model=model, - device="cpu", - no_weights=True, - library="transformers", - task="text-generation", - torch_dtype=torch_dtype, - quantization_scheme=quant_scheme, - quantization_config=quant_config, - attn_implementation=attn_implementation, - model_kwargs={"trust_remote_code": True}, - ) - - benchmark_config = BenchmarkConfig( - name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config - ) - - benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - try: - LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") - benchmark_report = Benchmark.launch(benchmark_config) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - except Exception: - LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") - benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - -if __name__ == "__main__": - # for isolated process - os.environ["LOG_TO_FILE"] = "0" - os.environ["LOG_LEVEL"] = "INFO" - - # for main process - setup_logging(level="INFO", prefix="MAIN-PROCESS") - - models_attentions_weights = list( - product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) - ) - - LOGGER.info( - f"Running a total of {len(models_attentions_weights)} benchmarks, " - f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " - f"{len(ATTENTION_CONFIGS)} attentions implementations " - f"and {len(WEIGHTS_CONFIGS)} weights configurations." 
- ) - - for model, attn_implementation, weights_config in models_attentions_weights: - benchmark_intel(model, attn_implementation, weights_config) diff --git a/llm_perf/hardware_provider/nvidia/update_llm_perf_cuda_pytorch.py b/llm_perf/hardware_provider/nvidia/update_llm_perf_cuda_pytorch.py deleted file mode 100644 index 8b65e1f5c..000000000 --- a/llm_perf/hardware_provider/nvidia/update_llm_perf_cuda_pytorch.py +++ /dev/null @@ -1,182 +0,0 @@ -import os -import traceback -from itertools import product -from logging import getLogger - -from llm_perf.utils import ( - CANONICAL_PRETRAINED_OPEN_LLM_LIST, - GENERATE_KWARGS, - INPUT_SHAPES, - OPEN_LLM_LIST, - PRETRAINED_OPEN_LLM_LIST, - is_benchmark_conducted, - is_benchmark_supported, -) -from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport, InferenceConfig, ProcessConfig, PyTorchConfig -from optimum_benchmark.logging_utils import setup_logging - -SUBSET = os.getenv("SUBSET", None) -MACHINE = os.getenv("MACHINE", None) -HARDWARE = "cuda" - - -if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: - PUSH_REPO_ID = "optimum-benchmark/llm-perf-pytorch-cuda-debug" - CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] - SUBSET = "unquantized" -elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-pytorch-cuda-{SUBSET}-{MACHINE}" -else: - raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") - -ATTENTION_CONFIGS = ["eager", "sdpa", "flash_attention_2"] -if SUBSET == "unquantized": - WEIGHTS_CONFIGS = { - # unquantized - "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, - "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, - "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, - } -elif SUBSET == "bnb": - WEIGHTS_CONFIGS = { - # bnb - "4bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_4bit": True}}, - "8bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_8bit": True}}, - } -elif SUBSET == "gptq": - WEIGHTS_CONFIGS = { - # gptq - "4bit-gptq-exllama-v1": { - "quant_scheme": "gptq", - "torch_dtype": "float16", - "quant_config": {"bits": 4, "use_exllama ": True, "version": 1, "model_seqlen": 256}, - }, - "4bit-gptq-exllama-v2": { - "torch_dtype": "float16", - "quant_scheme": "gptq", - "quant_config": {"bits": 4, "use_exllama ": True, "version": 2, "model_seqlen": 256}, - }, - } -elif SUBSET == "awq": - WEIGHTS_CONFIGS = { - # awq - "4bit-awq-gemm": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": {"bits": 4, "version": "gemm"}, - }, - "4bit-awq-gemv": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": {"bits": 4, "version": "gemv"}, - }, - "4bit-awq-exllama-v1": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": { - "bits": 4, - "version": "exllama", - "exllama_config": {"version": 1, "max_input_len": 64, "max_batch_size": 1}, - }, - }, - "4bit-awq-exllama-v2": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": { - "bits": 4, - "version": "exllama", - "exllama_config": {"version": 2, "max_input_len": 64, "max_batch_size": 1}, - }, - }, - } - - -LOGGER = getLogger("llm-perf-backend") -LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") -LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") 
-LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") - - -def benchmark_cuda_pytorch(model, attn_implementation, weights_config): - benchmark_name = f"{weights_config}-{attn_implementation}" - subfolder = f"{benchmark_name}/{model.replace('/', '--')}" - - torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] - quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] - quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] - - if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") - return - - if is_benchmark_conducted(PUSH_REPO_ID, subfolder): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") - return - - launcher_config = ProcessConfig(device_isolation=True, device_isolation_action="kill") - scenario_config = InferenceConfig( - memory=True, - energy=True, - latency=True, - duration=10, - iterations=10, - warmup_runs=10, - input_shapes=INPUT_SHAPES, - generate_kwargs=GENERATE_KWARGS, - ) - backend_config = PyTorchConfig( - model=model, - device="cuda", - device_ids="0", - no_weights=True, - library="transformers", - task="text-generation", - torch_dtype=torch_dtype, - quantization_scheme=quant_scheme, - quantization_config=quant_config, - attn_implementation=attn_implementation, - model_kwargs={"trust_remote_code": True}, - ) - - benchmark_config = BenchmarkConfig( - name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config - ) - - benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - try: - LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") - benchmark_report = Benchmark.launch(benchmark_config) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - except Exception: - LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") - benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - -if __name__ == "__main__": - # for isolated process - os.environ["LOG_TO_FILE"] = "0" - os.environ["LOG_LEVEL"] = "INFO" - - # for main process - setup_logging(level="INFO", prefix="MAIN-PROCESS") - - models_attentions_weights = list( - product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) - ) - - LOGGER.info( - f"Running a total of {len(models_attentions_weights)} benchmarks, " - f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " - f"{len(ATTENTION_CONFIGS)} attentions implementations " - f"and {len(WEIGHTS_CONFIGS)} weights configurations." 
- ) - - for model, attn_implementation, weights_config in models_attentions_weights: - benchmark_cuda_pytorch(model, attn_implementation, weights_config) diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index af224c06d..26f054d08 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -10,17 +10,22 @@ REPO_TYPE = "dataset" MAIN_REPO_ID = "optimum-benchmark/llm-perf-leaderboard" -PERF_REPO_ID = "optimum-benchmark/llm-perf-{backend}-{hardware_type}-{subset}-{machine}" +PERF_REPO_ID = "optimum-benchmark/llm-perf-{backend}-{hardware}-{subset}-{machine}" PERF_DF = "perf-df-{subset}-{machine}.csv" LLM_DF = "llm-df.csv" -def gather_benchmarks(subset: str, machine: str, backend: str, hardware_type: str): +def gather_benchmarks(subset: str, machine: str, backend: str, hardware_provider: str): """ Gather the benchmarks for a given machine """ - perf_repo_id = PERF_REPO_ID.format(subset=subset, machine=machine, backend=backend, hardware_type=hardware_type) + if hardware_provider == "nvidia": + hardware = "cuda" + else: + hardware = hardware_provider + + perf_repo_id = PERF_REPO_ID.format(subset=subset, machine=machine, backend=backend, hardware=hardware) snapshot = snapshot_download(repo_type=REPO_TYPE, repo_id=perf_repo_id, allow_patterns=["**/benchmark.json"]) dfs = [] @@ -44,10 +49,10 @@ def update_perf_dfs(): for subset in hardware_config.subsets: for backend in hardware_config.backends: try: - gather_benchmarks(subset, hardware_config.machine, backend, hardware_config.hardware_type) + gather_benchmarks(subset, hardware_config.machine, backend, hardware_config.hardware_provider) except Exception as e: print( - f"Error gathering benchmarks for {hardware_config.machine} with {hardware_config.hardware_type} and {subset} with {backend}: {e}" + f"Error gathering benchmarks for {hardware_config.machine} with {hardware_config.hardware_provider} and {subset} with {backend}: {e}" ) From a76e56dc196ab25533157cf90b377f617e7b6719 Mon Sep 17 00:00:00 2001 From: baptiste Date: Fri, 6 Sep 2024 13:37:56 +0000 Subject: [PATCH 41/73] add new workflow --- .../scripts/update_llm_perf_cpu_pytorch.py | 141 ++++++++++++++ .../scripts/update_llm_perf_cuda_pytorch.py | 182 ++++++++++++++++++ 2 files changed, 323 insertions(+) create mode 100644 llm_perf/scripts/update_llm_perf_cpu_pytorch.py create mode 100644 llm_perf/scripts/update_llm_perf_cuda_pytorch.py diff --git a/llm_perf/scripts/update_llm_perf_cpu_pytorch.py b/llm_perf/scripts/update_llm_perf_cpu_pytorch.py new file mode 100644 index 000000000..3b7dfe781 --- /dev/null +++ b/llm_perf/scripts/update_llm_perf_cpu_pytorch.py @@ -0,0 +1,141 @@ +import os +import traceback +from itertools import product +from logging import getLogger + +from llm_perf.utils import ( + CANONICAL_PRETRAINED_OPEN_LLM_LIST, + GENERATE_KWARGS, + INPUT_SHAPES, + OPEN_LLM_LIST, + PRETRAINED_OPEN_LLM_LIST, + is_benchmark_conducted, + is_benchmark_supported, +) +from optimum_benchmark import ( + Benchmark, + BenchmarkConfig, + BenchmarkReport, + InferenceConfig, + ProcessConfig, + PyTorchConfig, +) +from optimum_benchmark.logging_utils import setup_logging + +SUBSET = os.getenv("SUBSET", None) +MACHINE = os.getenv("MACHINE", None) +BACKEND = "pytorch" +HARDWARE = "intel" + +if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: + PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-intel-debug" + CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] + SUBSET = "unquantized" +elif 
os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: + PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-intel-{SUBSET}-{MACHINE}" +else: + raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") + +ATTENTION_CONFIGS = ["eager", "sdpa"] + + +if SUBSET == "unquantized": + WEIGHTS_CONFIGS = { + # unquantized + "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, + "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, + "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, + } +else: + raise ValueError(f"Subset {SUBSET} not supported") + + +LOGGER = getLogger("llm-perf-backend") +LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") +LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") +LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") + + +def benchmark_intel(model, attn_implementation, weights_config): + benchmark_name = f"{weights_config}-{attn_implementation}-{BACKEND}" + subfolder = f"{benchmark_name}/{model.replace('/', '--')}" + + torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] + quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] + quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] + + if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): + LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") + return + + if is_benchmark_conducted(PUSH_REPO_ID, subfolder): + LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") + return + + launcher_config = ProcessConfig() + scenario_config = InferenceConfig( + memory=True, + energy=True, + latency=True, + duration=10, + iterations=10, + warmup_runs=10, + input_shapes=INPUT_SHAPES, + generate_kwargs=GENERATE_KWARGS, + ) + + backend_config = PyTorchConfig( + model=model, + device="cpu", + no_weights=True, + library="transformers", + task="text-generation", + torch_dtype=torch_dtype, + quantization_scheme=quant_scheme, + quantization_config=quant_config, + attn_implementation=attn_implementation, + model_kwargs={"trust_remote_code": True}, + ) + + benchmark_config = BenchmarkConfig( + name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config + ) + + benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + try: + LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") + benchmark_report = Benchmark.launch(benchmark_config) + benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + except Exception: + LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") + benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) + benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + +if __name__ == "__main__": + # for isolated process + os.environ["LOG_TO_FILE"] = "0" + os.environ["LOG_LEVEL"] = "INFO" + + # for main process + setup_logging(level="INFO", prefix="MAIN-PROCESS") + + 
models_attentions_weights = list( + product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) + ) + + LOGGER.info( + f"Running a total of {len(models_attentions_weights)} benchmarks, " + f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " + f"{len(ATTENTION_CONFIGS)} attentions implementations " + f"and {len(WEIGHTS_CONFIGS)} weights configurations." + ) + + for model, attn_implementation, weights_config in models_attentions_weights: + benchmark_intel(model, attn_implementation, weights_config) diff --git a/llm_perf/scripts/update_llm_perf_cuda_pytorch.py b/llm_perf/scripts/update_llm_perf_cuda_pytorch.py new file mode 100644 index 000000000..8b65e1f5c --- /dev/null +++ b/llm_perf/scripts/update_llm_perf_cuda_pytorch.py @@ -0,0 +1,182 @@ +import os +import traceback +from itertools import product +from logging import getLogger + +from llm_perf.utils import ( + CANONICAL_PRETRAINED_OPEN_LLM_LIST, + GENERATE_KWARGS, + INPUT_SHAPES, + OPEN_LLM_LIST, + PRETRAINED_OPEN_LLM_LIST, + is_benchmark_conducted, + is_benchmark_supported, +) +from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport, InferenceConfig, ProcessConfig, PyTorchConfig +from optimum_benchmark.logging_utils import setup_logging + +SUBSET = os.getenv("SUBSET", None) +MACHINE = os.getenv("MACHINE", None) +HARDWARE = "cuda" + + +if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: + PUSH_REPO_ID = "optimum-benchmark/llm-perf-pytorch-cuda-debug" + CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] + SUBSET = "unquantized" +elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: + PUSH_REPO_ID = f"optimum-benchmark/llm-perf-pytorch-cuda-{SUBSET}-{MACHINE}" +else: + raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") + +ATTENTION_CONFIGS = ["eager", "sdpa", "flash_attention_2"] +if SUBSET == "unquantized": + WEIGHTS_CONFIGS = { + # unquantized + "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, + "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, + "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, + } +elif SUBSET == "bnb": + WEIGHTS_CONFIGS = { + # bnb + "4bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_4bit": True}}, + "8bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_8bit": True}}, + } +elif SUBSET == "gptq": + WEIGHTS_CONFIGS = { + # gptq + "4bit-gptq-exllama-v1": { + "quant_scheme": "gptq", + "torch_dtype": "float16", + "quant_config": {"bits": 4, "use_exllama ": True, "version": 1, "model_seqlen": 256}, + }, + "4bit-gptq-exllama-v2": { + "torch_dtype": "float16", + "quant_scheme": "gptq", + "quant_config": {"bits": 4, "use_exllama ": True, "version": 2, "model_seqlen": 256}, + }, + } +elif SUBSET == "awq": + WEIGHTS_CONFIGS = { + # awq + "4bit-awq-gemm": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": {"bits": 4, "version": "gemm"}, + }, + "4bit-awq-gemv": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": {"bits": 4, "version": "gemv"}, + }, + "4bit-awq-exllama-v1": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": { + "bits": 4, + "version": "exllama", + "exllama_config": {"version": 1, "max_input_len": 64, "max_batch_size": 1}, + }, + }, + "4bit-awq-exllama-v2": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": { 
+ "bits": 4, + "version": "exllama", + "exllama_config": {"version": 2, "max_input_len": 64, "max_batch_size": 1}, + }, + }, + } + + +LOGGER = getLogger("llm-perf-backend") +LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") +LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") +LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") + + +def benchmark_cuda_pytorch(model, attn_implementation, weights_config): + benchmark_name = f"{weights_config}-{attn_implementation}" + subfolder = f"{benchmark_name}/{model.replace('/', '--')}" + + torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] + quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] + quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] + + if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): + LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") + return + + if is_benchmark_conducted(PUSH_REPO_ID, subfolder): + LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") + return + + launcher_config = ProcessConfig(device_isolation=True, device_isolation_action="kill") + scenario_config = InferenceConfig( + memory=True, + energy=True, + latency=True, + duration=10, + iterations=10, + warmup_runs=10, + input_shapes=INPUT_SHAPES, + generate_kwargs=GENERATE_KWARGS, + ) + backend_config = PyTorchConfig( + model=model, + device="cuda", + device_ids="0", + no_weights=True, + library="transformers", + task="text-generation", + torch_dtype=torch_dtype, + quantization_scheme=quant_scheme, + quantization_config=quant_config, + attn_implementation=attn_implementation, + model_kwargs={"trust_remote_code": True}, + ) + + benchmark_config = BenchmarkConfig( + name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config + ) + + benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + try: + LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") + benchmark_report = Benchmark.launch(benchmark_config) + benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + except Exception: + LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") + benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) + benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) + + +if __name__ == "__main__": + # for isolated process + os.environ["LOG_TO_FILE"] = "0" + os.environ["LOG_LEVEL"] = "INFO" + + # for main process + setup_logging(level="INFO", prefix="MAIN-PROCESS") + + models_attentions_weights = list( + product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) + ) + + LOGGER.info( + f"Running a total of {len(models_attentions_weights)} benchmarks, " + f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " + f"{len(ATTENTION_CONFIGS)} attentions implementations " + f"and {len(WEIGHTS_CONFIGS)} weights configurations." 
+ ) + + for model, attn_implementation, weights_config in models_attentions_weights: + benchmark_cuda_pytorch(model, attn_implementation, weights_config) From 6677def85c23af7eb3644c1e48b00df36d83a010 Mon Sep 17 00:00:00 2001 From: baptiste Date: Fri, 6 Sep 2024 14:03:53 +0000 Subject: [PATCH 42/73] add new workflow --- llm_perf/hardware.yml | 8 ++++---- llm_perf/{scripts => }/update_llm_perf_cpu_pytorch.py | 0 .../{scripts => }/update_llm_perf_cuda_pytorch.py | 0 llm_perf/update_llm_perf_leaderboard.py | 11 +++-------- 4 files changed, 7 insertions(+), 12 deletions(-) rename llm_perf/{scripts => }/update_llm_perf_cpu_pytorch.py (100%) rename llm_perf/{scripts => }/update_llm_perf_cuda_pytorch.py (100%) diff --git a/llm_perf/hardware.yml b/llm_perf/hardware.yml index ac7f85b2d..49819a860 100644 --- a/llm_perf/hardware.yml +++ b/llm_perf/hardware.yml @@ -1,7 +1,7 @@ - machine: 1xA10 description: A10-24GB-150W 🖥️ hardware_provider: nvidia - hardware_type: gpu + hardware_backend: cuda subsets: - unquantized - awq @@ -13,7 +13,7 @@ - machine: 1xA100 description: A100-80GB-275W 🖥️ hardware_provider: nvidia - hardware_type: gpu + hardware_backend: cuda subsets: - unquantized - awq @@ -25,7 +25,7 @@ - machine: 1xT4 description: T4-16GB-70W 🖥️ hardware_provider: nvidia - hardware_type: gpu + hardware_backend: cuda subsets: - unquantized - awq @@ -37,7 +37,7 @@ - machine: c7i description: 4th-Gen-Intel-Xeon-385W 🖥️ hardware_provider: intel - hardware_type: cpu + hardware_backend: cpu subsets: - unquantized backends: diff --git a/llm_perf/scripts/update_llm_perf_cpu_pytorch.py b/llm_perf/update_llm_perf_cpu_pytorch.py similarity index 100% rename from llm_perf/scripts/update_llm_perf_cpu_pytorch.py rename to llm_perf/update_llm_perf_cpu_pytorch.py diff --git a/llm_perf/scripts/update_llm_perf_cuda_pytorch.py b/llm_perf/update_llm_perf_cuda_pytorch.py similarity index 100% rename from llm_perf/scripts/update_llm_perf_cuda_pytorch.py rename to llm_perf/update_llm_perf_cuda_pytorch.py diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index 26f054d08..84461f45b 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -10,22 +10,17 @@ REPO_TYPE = "dataset" MAIN_REPO_ID = "optimum-benchmark/llm-perf-leaderboard" -PERF_REPO_ID = "optimum-benchmark/llm-perf-{backend}-{hardware}-{subset}-{machine}" +PERF_REPO_ID = "optimum-benchmark/llm-perf-{backend}-{hardware_backend}-{subset}-{machine}" PERF_DF = "perf-df-{subset}-{machine}.csv" LLM_DF = "llm-df.csv" -def gather_benchmarks(subset: str, machine: str, backend: str, hardware_provider: str): +def gather_benchmarks(subset: str, machine: str, backend: str, hardware_backend: str): """ Gather the benchmarks for a given machine """ - if hardware_provider == "nvidia": - hardware = "cuda" - else: - hardware = hardware_provider - - perf_repo_id = PERF_REPO_ID.format(subset=subset, machine=machine, backend=backend, hardware=hardware) + perf_repo_id = PERF_REPO_ID.format(subset=subset, machine=machine, backend=backend, hardware_backend=hardware_backend) snapshot = snapshot_download(repo_type=REPO_TYPE, repo_id=perf_repo_id, allow_patterns=["**/benchmark.json"]) dfs = [] From 6593487c67b3159ad3f7a0b7bd655a05b5f0849d Mon Sep 17 00:00:00 2001 From: baptiste Date: Fri, 6 Sep 2024 14:04:39 +0000 Subject: [PATCH 43/73] add new workflow --- .github/workflows/update_llm_perf_cuda_pytorch.yaml | 2 +- .github/workflows/update_llm_perf_intel_pytorch.yaml | 2 +- 2 files changed, 2 
insertions(+), 2 deletions(-) diff --git a/.github/workflows/update_llm_perf_cuda_pytorch.yaml b/.github/workflows/update_llm_perf_cuda_pytorch.yaml index 78b2f1f2c..0ab646ab9 100644 --- a/.github/workflows/update_llm_perf_cuda_pytorch.yaml +++ b/.github/workflows/update_llm_perf_cuda_pytorch.yaml @@ -53,4 +53,4 @@ jobs: pip install packaging && pip install flash-attn einops scipy auto-gptq optimum bitsandbytes autoawq codecarbon pip install -U transformers huggingface_hub[hf_transfer] pip install -e . - python llm_perf/scripts/update_llm_perf_cuda_pytorch.py + python llm_perf/update_llm_perf_cuda_pytorch.py diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yaml b/.github/workflows/update_llm_perf_intel_pytorch.yaml index 9b44ab711..6032182f2 100644 --- a/.github/workflows/update_llm_perf_intel_pytorch.yaml +++ b/.github/workflows/update_llm_perf_intel_pytorch.yaml @@ -51,4 +51,4 @@ jobs: pip install packaging && pip install einops scipy optimum codecarbon pip install -U transformers huggingface_hub[hf_transfer] pip install -e .[onnxruntime,openvino] - python llm_perf/scripts/update_llm_perf_cpu_pytorch.py + python llm_perf/update_llm_perf_cpu_pytorch.py From 9802c95e20e85866c10db23d9f56301679a477ae Mon Sep 17 00:00:00 2001 From: baptiste Date: Fri, 6 Sep 2024 14:09:00 +0000 Subject: [PATCH 44/73] add new workflow --- llm_perf/update_llm_perf_cpu_pytorch.py | 8 +++++++- llm_perf/update_llm_perf_cuda_pytorch.py | 12 ++++++++---- llm_perf/update_llm_perf_leaderboard.py | 4 +++- llm_perf/utils.py | 11 ----------- 4 files changed, 18 insertions(+), 17 deletions(-) diff --git a/llm_perf/update_llm_perf_cpu_pytorch.py b/llm_perf/update_llm_perf_cpu_pytorch.py index 3b7dfe781..1e8cce1bc 100644 --- a/llm_perf/update_llm_perf_cpu_pytorch.py +++ b/llm_perf/update_llm_perf_cpu_pytorch.py @@ -10,7 +10,6 @@ OPEN_LLM_LIST, PRETRAINED_OPEN_LLM_LIST, is_benchmark_conducted, - is_benchmark_supported, ) from optimum_benchmark import ( Benchmark, @@ -56,6 +55,13 @@ LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") +def is_benchmark_supported(weights_config, attn_implementation, hardware): + if attn_implementation == "flash_attention_2": + return False + + return True + + def benchmark_intel(model, attn_implementation, weights_config): benchmark_name = f"{weights_config}-{attn_implementation}-{BACKEND}" subfolder = f"{benchmark_name}/{model.replace('/', '--')}" diff --git a/llm_perf/update_llm_perf_cuda_pytorch.py b/llm_perf/update_llm_perf_cuda_pytorch.py index 8b65e1f5c..3a216aca6 100644 --- a/llm_perf/update_llm_perf_cuda_pytorch.py +++ b/llm_perf/update_llm_perf_cuda_pytorch.py @@ -10,15 +10,12 @@ OPEN_LLM_LIST, PRETRAINED_OPEN_LLM_LIST, is_benchmark_conducted, - is_benchmark_supported, ) from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport, InferenceConfig, ProcessConfig, PyTorchConfig from optimum_benchmark.logging_utils import setup_logging SUBSET = os.getenv("SUBSET", None) MACHINE = os.getenv("MACHINE", None) -HARDWARE = "cuda" - if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: PUSH_REPO_ID = "optimum-benchmark/llm-perf-pytorch-cuda-debug" @@ -97,6 +94,13 @@ LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") +def is_benchmark_supported(weights_config, attn_implementation): + if attn_implementation == "flash_attention_2" and weights_config == "float32": + return False + + return True + + def benchmark_cuda_pytorch(model, attn_implementation, 
weights_config): benchmark_name = f"{weights_config}-{attn_implementation}" subfolder = f"{benchmark_name}/{model.replace('/', '--')}" @@ -105,7 +109,7 @@ def benchmark_cuda_pytorch(model, attn_implementation, weights_config): quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] - if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): + if not is_benchmark_supported(weights_config, attn_implementation): LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") return diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index 84461f45b..df45288f5 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -20,7 +20,9 @@ def gather_benchmarks(subset: str, machine: str, backend: str, hardware_backend: """ Gather the benchmarks for a given machine """ - perf_repo_id = PERF_REPO_ID.format(subset=subset, machine=machine, backend=backend, hardware_backend=hardware_backend) + perf_repo_id = PERF_REPO_ID.format( + subset=subset, machine=machine, backend=backend, hardware_backend=hardware_backend + ) snapshot = snapshot_download(repo_type=REPO_TYPE, repo_id=perf_repo_id, allow_patterns=["**/benchmark.json"]) dfs = [] diff --git a/llm_perf/utils.py b/llm_perf/utils.py index 8a5c43a83..94cacc7c0 100644 --- a/llm_perf/utils.py +++ b/llm_perf/utils.py @@ -141,17 +141,6 @@ def is_benchmark_conducted(push_repo_id, subfolder): return False -def is_benchmark_supported(weights_config, attn_implementation, hardware): - if hardware == "cuda": - if attn_implementation == "flash_attention_2" and weights_config == "float32": - return False - elif hardware == "intel": - if attn_implementation == "flash_attention_2": - return False - - return True - - class HardwareType(Enum): CPU = auto() GPU = auto() From b6b947fdba1f9a7350c531633528700b0acf0c29 Mon Sep 17 00:00:00 2001 From: baptiste Date: Fri, 6 Sep 2024 14:11:29 +0000 Subject: [PATCH 45/73] add new workflow --- llm_perf/utils.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/llm_perf/utils.py b/llm_perf/utils.py index 94cacc7c0..8b0d78588 100644 --- a/llm_perf/utils.py +++ b/llm_perf/utils.py @@ -140,26 +140,19 @@ def is_benchmark_conducted(push_repo_id, subfolder): except Exception: return False - -class HardwareType(Enum): - CPU = auto() - GPU = auto() - - class HardwareConfig: def __init__(self, data: Dict[str, Any]): self.machine = data["machine"] self.description = data["description"] self.hardware_provider = data["hardware provider"] - self.hardware_type = data["hardware type"] - assert self.hardware_type in HardwareType, f"Hardware type {self.hardware_type} not supported" + self.hardware_backend = data["hardware_backend type"] self.subsets = data["subsets"] self.backends = data["backends"] def __repr__(self): return ( f"HardwareConfig(machine='{self.machine}', description='{self.description}', " - f"hardware_type={self.hardware_type}, subsets={self.subsets}, backends={self.backends})" + f"hardware_type={self.hardware_backend}, subsets={self.subsets}, backends={self.backends})" ) From 7a891c1f0b9888a5d5a942dfeadfecaefb75ad2a Mon Sep 17 00:00:00 2001 From: baptiste Date: Fri, 6 Sep 2024 14:13:32 +0000 Subject: [PATCH 46/73] add new workflow --- llm_perf/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_perf/utils.py b/llm_perf/utils.py index 8b0d78588..aa4b961e5 100644 --- 
a/llm_perf/utils.py +++ b/llm_perf/utils.py @@ -1,4 +1,3 @@ -from enum import Enum, auto from typing import Any, Dict, List import pandas as pd @@ -140,6 +139,7 @@ def is_benchmark_conducted(push_repo_id, subfolder): except Exception: return False + class HardwareConfig: def __init__(self, data: Dict[str, Any]): self.machine = data["machine"] From a6f289bbbc920b2f78efa129a08e33c8214f6353 Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 10 Sep 2024 12:23:36 +0000 Subject: [PATCH 47/73] remove intel reference --- .github/workflows/update_llm_perf_intel_pytorch.yaml | 2 +- llm_perf/update_llm_perf_cpu_pytorch.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yaml b/.github/workflows/update_llm_perf_intel_pytorch.yaml index 6032182f2..3632d5412 100644 --- a/.github/workflows/update_llm_perf_intel_pytorch.yaml +++ b/.github/workflows/update_llm_perf_intel_pytorch.yaml @@ -50,5 +50,5 @@ jobs: run: | pip install packaging && pip install einops scipy optimum codecarbon pip install -U transformers huggingface_hub[hf_transfer] - pip install -e .[onnxruntime,openvino] + pip install -e . python llm_perf/update_llm_perf_cpu_pytorch.py diff --git a/llm_perf/update_llm_perf_cpu_pytorch.py b/llm_perf/update_llm_perf_cpu_pytorch.py index 1e8cce1bc..def443cfe 100644 --- a/llm_perf/update_llm_perf_cpu_pytorch.py +++ b/llm_perf/update_llm_perf_cpu_pytorch.py @@ -24,14 +24,14 @@ SUBSET = os.getenv("SUBSET", None) MACHINE = os.getenv("MACHINE", None) BACKEND = "pytorch" -HARDWARE = "intel" +HARDWARE = "cpu" if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-intel-debug" + PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-{HARDWARE}-debug" CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] SUBSET = "unquantized" elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-intel-{SUBSET}-{MACHINE}" + PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-{HARDWARE}-{SUBSET}-{MACHINE}" else: raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") @@ -62,7 +62,7 @@ def is_benchmark_supported(weights_config, attn_implementation, hardware): return True -def benchmark_intel(model, attn_implementation, weights_config): +def benchmark_cpu_pytorch(model, attn_implementation, weights_config): benchmark_name = f"{weights_config}-{attn_implementation}-{BACKEND}" subfolder = f"{benchmark_name}/{model.replace('/', '--')}" @@ -144,4 +144,4 @@ def benchmark_intel(model, attn_implementation, weights_config): ) for model, attn_implementation, weights_config in models_attentions_weights: - benchmark_intel(model, attn_implementation, weights_config) + benchmark_cpu_pytorch(model, attn_implementation, weights_config) From e97ee56109f239c5c050439992a7bf391ebbb1b1 Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 10 Sep 2024 12:25:02 +0000 Subject: [PATCH 48/73] remove intel reference --- .../update_llm_perf_intel_pytorch.yaml | 54 ------------------- 1 file changed, 54 deletions(-) delete mode 100644 .github/workflows/update_llm_perf_intel_pytorch.yaml diff --git a/.github/workflows/update_llm_perf_intel_pytorch.yaml b/.github/workflows/update_llm_perf_intel_pytorch.yaml deleted file mode 100644 index 3632d5412..000000000 --- a/.github/workflows/update_llm_perf_intel_pytorch.yaml +++ /dev/null @@ -1,54 +0,0 @@ -name: Update LLM Perf Benchmarks 
- Intel PyTorch - -on: - workflow_dispatch: - push: - schedule: - - cron: "0 0 * * *" - -concurrency: - cancel-in-progress: true - group: ${{ github.workflow }}-${{ github.ref }} - -env: - IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cpu - -jobs: - run_benchmarks: - strategy: - fail-fast: false - matrix: - subset: [unquantized] - machine: [ - {name: 32vCPU-C7i, runs-on: {group: 'aws-c7i-8xlarge-plus'}}, - ] - - runs-on: ${{ matrix.machine.runs-on }} - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Run benchmarks - uses: addnab/docker-run-action@v3 - env: - SUBSET: ${{ matrix.subset }} - MACHINE: ${{ matrix.machine.name }} - HF_TOKEN: ${{ secrets.HF_TOKEN }} - with: - image: ${{ env.IMAGE }} - options: | - --rm - --shm-size 64G - --env SUBSET - --env MACHINE - --env HF_TOKEN - --env MKL_THREADING_LAYER=GNU - --env HF_HUB_ENABLE_HF_TRANSFER=1 - --volume ${{ github.workspace }}:/workspace - --workdir /workspace - run: | - pip install packaging && pip install einops scipy optimum codecarbon - pip install -U transformers huggingface_hub[hf_transfer] - pip install -e . - python llm_perf/update_llm_perf_cpu_pytorch.py From f5f0eebce2e76fa7d23ab7f7aa9a851d9b5c34a5 Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 10 Sep 2024 12:25:09 +0000 Subject: [PATCH 49/73] remove intel reference --- .../update_llm_perf_cpu_pytorch.yaml | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 .github/workflows/update_llm_perf_cpu_pytorch.yaml diff --git a/.github/workflows/update_llm_perf_cpu_pytorch.yaml b/.github/workflows/update_llm_perf_cpu_pytorch.yaml new file mode 100644 index 000000000..3632d5412 --- /dev/null +++ b/.github/workflows/update_llm_perf_cpu_pytorch.yaml @@ -0,0 +1,54 @@ +name: Update LLM Perf Benchmarks - Intel PyTorch + +on: + workflow_dispatch: + push: + schedule: + - cron: "0 0 * * *" + +concurrency: + cancel-in-progress: true + group: ${{ github.workflow }}-${{ github.ref }} + +env: + IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cpu + +jobs: + run_benchmarks: + strategy: + fail-fast: false + matrix: + subset: [unquantized] + machine: [ + {name: 32vCPU-C7i, runs-on: {group: 'aws-c7i-8xlarge-plus'}}, + ] + + runs-on: ${{ matrix.machine.runs-on }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Run benchmarks + uses: addnab/docker-run-action@v3 + env: + SUBSET: ${{ matrix.subset }} + MACHINE: ${{ matrix.machine.name }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} + with: + image: ${{ env.IMAGE }} + options: | + --rm + --shm-size 64G + --env SUBSET + --env MACHINE + --env HF_TOKEN + --env MKL_THREADING_LAYER=GNU + --env HF_HUB_ENABLE_HF_TRANSFER=1 + --volume ${{ github.workspace }}:/workspace + --workdir /workspace + run: | + pip install packaging && pip install einops scipy optimum codecarbon + pip install -U transformers huggingface_hub[hf_transfer] + pip install -e . 
+ python llm_perf/update_llm_perf_cpu_pytorch.py From 55e2c69014badb8df5e6d57f57399fe5b22e587b Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 10 Sep 2024 13:34:39 +0000 Subject: [PATCH 50/73] refractoring done --- .../update_llm_perf_cpu_pytorch.yaml | 54 ----- .../update_llm_perf_cuda_pytorch.yaml | 2 +- .../update_llm_perf_cuda_pytorch.py | 126 ++++++++++++ llm_perf/common/benchmark_runner.py | 116 +++++++++++ llm_perf/common/hardware_config.py | 25 +++ llm_perf/{ => common}/utils.py | 38 ---- llm_perf/hardware.yml | 13 +- llm_perf/update_llm_perf_cpu_pytorch.py | 147 -------------- llm_perf/update_llm_perf_cuda_pytorch.py | 186 ------------------ llm_perf/update_llm_perf_leaderboard.py | 2 +- 10 files changed, 270 insertions(+), 439 deletions(-) delete mode 100644 .github/workflows/update_llm_perf_cpu_pytorch.yaml create mode 100644 llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py create mode 100644 llm_perf/common/benchmark_runner.py create mode 100644 llm_perf/common/hardware_config.py rename llm_perf/{ => common}/utils.py (75%) delete mode 100644 llm_perf/update_llm_perf_cpu_pytorch.py delete mode 100644 llm_perf/update_llm_perf_cuda_pytorch.py diff --git a/.github/workflows/update_llm_perf_cpu_pytorch.yaml b/.github/workflows/update_llm_perf_cpu_pytorch.yaml deleted file mode 100644 index 3632d5412..000000000 --- a/.github/workflows/update_llm_perf_cpu_pytorch.yaml +++ /dev/null @@ -1,54 +0,0 @@ -name: Update LLM Perf Benchmarks - Intel PyTorch - -on: - workflow_dispatch: - push: - schedule: - - cron: "0 0 * * *" - -concurrency: - cancel-in-progress: true - group: ${{ github.workflow }}-${{ github.ref }} - -env: - IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cpu - -jobs: - run_benchmarks: - strategy: - fail-fast: false - matrix: - subset: [unquantized] - machine: [ - {name: 32vCPU-C7i, runs-on: {group: 'aws-c7i-8xlarge-plus'}}, - ] - - runs-on: ${{ matrix.machine.runs-on }} - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Run benchmarks - uses: addnab/docker-run-action@v3 - env: - SUBSET: ${{ matrix.subset }} - MACHINE: ${{ matrix.machine.name }} - HF_TOKEN: ${{ secrets.HF_TOKEN }} - with: - image: ${{ env.IMAGE }} - options: | - --rm - --shm-size 64G - --env SUBSET - --env MACHINE - --env HF_TOKEN - --env MKL_THREADING_LAYER=GNU - --env HF_HUB_ENABLE_HF_TRANSFER=1 - --volume ${{ github.workspace }}:/workspace - --workdir /workspace - run: | - pip install packaging && pip install einops scipy optimum codecarbon - pip install -U transformers huggingface_hub[hf_transfer] - pip install -e . - python llm_perf/update_llm_perf_cpu_pytorch.py diff --git a/.github/workflows/update_llm_perf_cuda_pytorch.yaml b/.github/workflows/update_llm_perf_cuda_pytorch.yaml index 0ab646ab9..658e63fd1 100644 --- a/.github/workflows/update_llm_perf_cuda_pytorch.yaml +++ b/.github/workflows/update_llm_perf_cuda_pytorch.yaml @@ -53,4 +53,4 @@ jobs: pip install packaging && pip install flash-attn einops scipy auto-gptq optimum bitsandbytes autoawq codecarbon pip install -U transformers huggingface_hub[hf_transfer] pip install -e . 
-          python llm_perf/update_llm_perf_cuda_pytorch.py
+          python llm_perf/benchmarks/update_llm_perf_cuda_pytorch.py
diff --git a/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py b/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py
new file mode 100644
index 000000000..f22b28eb0
--- /dev/null
+++ b/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py
@@ -0,0 +1,126 @@
+from typing import Any, Dict, List
+
+from llm_perf.common.benchmark_runner import BenchmarkRunner
+from llm_perf.common.utils import GENERATE_KWARGS, INPUT_SHAPES
+from optimum_benchmark import PyTorchConfig
+from optimum_benchmark.benchmark.config import BenchmarkConfig
+from optimum_benchmark.launchers.process.config import ProcessConfig
+from optimum_benchmark.scenarios.inference.config import InferenceConfig
+
+
+class CUDAPyTorchBenchmarkRunner(BenchmarkRunner):
+    def __init__(self):
+        super().__init__(backend="pytorch", hardware="cuda")
+
+    def is_benchmark_supported(self, weights_config: str, attn_implementation: str) -> bool:
+        if attn_implementation == "flash_attention_2" and weights_config == "float32":
+            return False
+        return True
+
+    def get_benchmark_config(self, model: str, attn_implementation: str, weights_config: str) -> BenchmarkConfig:
+        assert (
+            weights_config in self.weights_configs
+        ), f"your config does not contain {weights_config}, adjust your _get_weights_configs to fix this issue"
+
+        torch_dtype = self.weights_configs[weights_config]["torch_dtype"]
+        quant_scheme = self.weights_configs[weights_config]["quant_scheme"]
+        quant_config = self.weights_configs[weights_config]["quant_config"]
+
+        launcher_config = ProcessConfig(device_isolation=True, device_isolation_action="kill")
+        scenario_config = InferenceConfig(
+            memory=True,
+            energy=True,
+            latency=True,
+            duration=10,
+            iterations=10,
+            warmup_runs=10,
+            input_shapes=INPUT_SHAPES,
+            generate_kwargs=GENERATE_KWARGS,
+        )
+        backend_config = PyTorchConfig(
+            model=model,
+            device="cuda",
+            device_ids="0",
+            no_weights=True,
+            library="transformers",
+            task="text-generation",
+            torch_dtype=torch_dtype,
+            quantization_scheme=quant_scheme,
+            quantization_config=quant_config,
+            attn_implementation=attn_implementation,
+            model_kwargs={"trust_remote_code": True},
+        )
+
+        return BenchmarkConfig(
+            name=f"{weights_config}-{attn_implementation}",
+            scenario=scenario_config,
+            launcher=launcher_config,
+            backend=backend_config,
+        )
+
+    def _get_weights_configs(self, subset) -> Dict[str, Dict[str, Any]]:
+        if subset == "unquantized":
+            return {
+                "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}},
+                "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}},
+                "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}},
+            }
+        elif subset == "bnb":
+            return {
+                "4bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_4bit": True}},
+                "8bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_8bit": True}},
+            }
+        elif subset == "gptq":
+            return {
+                "4bit-gptq-exllama-v1": {
+                    "torch_dtype": "float16",
+                    "quant_scheme": "gptq",
+                    "quant_config": {"bits": 4, "use_exllama": True, "version": 1, "model_seqlen": 256},
+                },
+                "4bit-gptq-exllama-v2": {
+                    "torch_dtype": "float16",
+                    "quant_scheme": "gptq",
+                    "quant_config": {"bits": 4, "use_exllama": True, "version": 2, "model_seqlen": 256},
+                },
+            }
+        elif subset == "awq":
+            return {
+                "4bit-awq-gemm": {
+                    "torch_dtype": "float16",
+                    "quant_scheme": "awq",
+                    
"quant_config": {"bits": 4, "version": "gemm"}, + }, + "4bit-awq-gemv": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": {"bits": 4, "version": "gemv"}, + }, + "4bit-awq-exllama-v1": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": { + "bits": 4, + "version": "exllama", + "exllama_config": {"version": 1, "max_input_len": 64, "max_batch_size": 1}, + }, + }, + "4bit-awq-exllama-v2": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": { + "bits": 4, + "version": "exllama", + "exllama_config": {"version": 2, "max_input_len": 64, "max_batch_size": 1}, + }, + }, + } + else: + raise ValueError(f"Unknown subset: {subset}") + + def _get_attention_configs(self) -> List[str]: + return ["eager", "sdpa", "flash_attention_2"] + + +if __name__ == "__main__": + runner = CUDAPyTorchBenchmarkRunner() + runner.run_benchmarks() diff --git a/llm_perf/common/benchmark_runner.py b/llm_perf/common/benchmark_runner.py new file mode 100644 index 000000000..513e6c726 --- /dev/null +++ b/llm_perf/common/benchmark_runner.py @@ -0,0 +1,116 @@ +import os +import traceback +from abc import ABC, abstractmethod +from itertools import product +from logging import getLogger +from typing import Any, Dict, List, Optional + +from llm_perf.common.utils import ( + CANONICAL_PRETRAINED_OPEN_LLM_LIST, + OPEN_LLM_LIST, + PRETRAINED_OPEN_LLM_LIST, +) +from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport +from optimum_benchmark.logging_utils import setup_logging + + +class BenchmarkRunner(ABC): + def __init__(self, backend: str, hardware: str, subset: Optional[str] = None, machine: Optional[str] = None): + self.backend = backend + self.hardware = hardware + self.subset = subset or os.getenv("SUBSET", None) + self.machine = machine or os.getenv("MACHINE", None) + self.logger = getLogger("llm-perf-backend") + + if self.machine is None and self.subset is None: + self.push_repo_id = f"optimum-benchmark/llm-perf-{self.backend}-{self.hardware}-debug" + self.canonical_pretrained_open_llm_list = ["gpt2"] + self.subset = "unquantized" + elif self.machine is not None and self.subset is not None: + self.push_repo_id = ( + f"optimum-benchmark/llm-perf-{self.backend}-{self.hardware}-{self.subset}-{self.machine}" + ) + else: + raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") + + self.attention_configs = self._get_attention_configs() + self.weights_configs = self._get_weights_configs(self.subset) + + self.logger.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") + self.logger.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") + self.logger.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") + + @abstractmethod + def _get_weights_configs(self, subset: str) -> Dict[str, Dict[str, Any]]: + raise NotImplementedError("This method should be implemented in the child class") + + @abstractmethod + def _get_attention_configs(self) -> List[str]: + raise NotImplementedError("This method should be implemented in the child class") + + @abstractmethod + def is_benchmark_supported(self, weights_config: str, attn_implementation: str) -> bool: + raise NotImplementedError("This method should be implemented in the child class") + + def run_benchmarks(self): + os.environ["LOG_TO_FILE"] = "0" + os.environ["LOG_LEVEL"] = "INFO" + setup_logging(level="INFO", prefix="MAIN-PROCESS") + + models_attentions_weights = list( + product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, 
self.attention_configs, self.weights_configs.keys()) + ) + + self.logger.info( + f"Running a total of {len(models_attentions_weights)} benchmarks, " + f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " + f"{len(self.attention_configs)} attentions implementations " + f"and {len(self.weights_configs)} weights configurations." + ) + + for model, attn_implementation, weights_config in models_attentions_weights: + self.run_benchmark(model, attn_implementation, weights_config) + + def is_benchmark_conducted(self, push_repo_id, subfolder): + try: + report = BenchmarkReport.from_pretrained(repo_id=push_repo_id, subfolder=subfolder) + if "traceback" in report.to_dict(): + return False + else: + return True + except Exception: + return False + + def run_benchmark(self, model: str, attn_implementation: str, weights_config: str): + benchmark_name = f"{weights_config}-{attn_implementation}" + subfolder = f"{benchmark_name}/{model.replace('/', '--')}" + + if not self.is_benchmark_supported(weights_config, attn_implementation): + self.logger.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") + return + + if self.is_benchmark_conducted(self.push_repo_id, subfolder): + self.logger.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") + return + + benchmark_config = self.get_benchmark_config(model, attn_implementation, weights_config) + benchmark_config.push_to_hub(repo_id=self.push_repo_id, subfolder=subfolder, private=True) + self.execute_and_log_benchmark(benchmark_config, subfolder) + + @abstractmethod + def get_benchmark_config(self, model: str, attn_implementation: str, weights_config: str) -> BenchmarkConfig: + raise NotImplementedError("This method should be implemented in the child class") + + def execute_and_log_benchmark(self, benchmark_config: BenchmarkConfig, subfolder: str): + try: + self.logger.info(f"Running benchmark {benchmark_config.name} with model {benchmark_config.backend.model}") + benchmark_report = Benchmark.launch(benchmark_config) + benchmark_report.push_to_hub(repo_id=self.push_repo_id, subfolder=subfolder, private=True) + benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + benchmark.push_to_hub(repo_id=self.push_repo_id, subfolder=subfolder, private=True) + except Exception: + self.logger.error(f"Benchmark {benchmark_config.name} failed with model {benchmark_config.backend.model}") + benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) + benchmark_report.push_to_hub(repo_id=self.push_repo_id, subfolder=subfolder, private=True) + benchmark = Benchmark(config=benchmark_config, report=benchmark_report) + benchmark.push_to_hub(repo_id=self.push_repo_id, subfolder=subfolder, private=True) diff --git a/llm_perf/common/hardware_config.py b/llm_perf/common/hardware_config.py new file mode 100644 index 000000000..5b1cfec3d --- /dev/null +++ b/llm_perf/common/hardware_config.py @@ -0,0 +1,25 @@ +from typing import Any, Dict, List + +import yaml + + +class HardwareConfig: + def __init__(self, data: Dict[str, Any]): + self.machine = data["machine"] + self.description = data["description"] + self.hardware_provider = data["hardware provider"] + self.hardware_backend = data["hardware_backend type"] + self.subsets = data["subsets"] + self.backends = data["backends"] + + def __repr__(self): + return ( + f"HardwareConfig(machine='{self.machine}', description='{self.description}', " + f"hardware_type={self.hardware_backend}, subsets={self.subsets}, 
backends={self.backends})" + ) + + +def load_hardware_configs(file_path: str) -> List[HardwareConfig]: + with open(file_path, "r") as file: + data = yaml.safe_load(file) + return [HardwareConfig(config) for config in data] diff --git a/llm_perf/utils.py b/llm_perf/common/utils.py similarity index 75% rename from llm_perf/utils.py rename to llm_perf/common/utils.py index aa4b961e5..06eba06a0 100644 --- a/llm_perf/utils.py +++ b/llm_perf/common/utils.py @@ -1,9 +1,4 @@ -from typing import Any, Dict, List - import pandas as pd -import yaml - -from optimum_benchmark.benchmark.report import BenchmarkReport INPUT_SHAPES = {"batch_size": 1, "sequence_length": 256} GENERATE_KWARGS = {"max_new_tokens": 64, "min_new_tokens": 64} @@ -127,36 +122,3 @@ "togethercomputer/RedPajama-INCITE-Base-3B-v1", "togethercomputer/RedPajama-INCITE-Base-7B-v0.1", ] - - -def is_benchmark_conducted(push_repo_id, subfolder): - try: - report = BenchmarkReport.from_pretrained(repo_id=push_repo_id, subfolder=subfolder) - if "traceback" in report.to_dict(): - return False - else: - return True - except Exception: - return False - - -class HardwareConfig: - def __init__(self, data: Dict[str, Any]): - self.machine = data["machine"] - self.description = data["description"] - self.hardware_provider = data["hardware provider"] - self.hardware_backend = data["hardware_backend type"] - self.subsets = data["subsets"] - self.backends = data["backends"] - - def __repr__(self): - return ( - f"HardwareConfig(machine='{self.machine}', description='{self.description}', " - f"hardware_type={self.hardware_backend}, subsets={self.subsets}, backends={self.backends})" - ) - - -def load_hardware_configs(file_path: str) -> List[HardwareConfig]: - with open(file_path, "r") as file: - data = yaml.safe_load(file) - return [HardwareConfig(config) for config in data] diff --git a/llm_perf/hardware.yml b/llm_perf/hardware.yml index 49819a860..5f0604e35 100644 --- a/llm_perf/hardware.yml +++ b/llm_perf/hardware.yml @@ -32,15 +32,4 @@ - bnb - gptq backends: - - pytorch - -- machine: c7i - description: 4th-Gen-Intel-Xeon-385W 🖥️ - hardware_provider: intel - hardware_backend: cpu - subsets: - - unquantized - backends: - - pytorch - - onnxruntime - - openvino \ No newline at end of file + - pytorch \ No newline at end of file diff --git a/llm_perf/update_llm_perf_cpu_pytorch.py b/llm_perf/update_llm_perf_cpu_pytorch.py deleted file mode 100644 index def443cfe..000000000 --- a/llm_perf/update_llm_perf_cpu_pytorch.py +++ /dev/null @@ -1,147 +0,0 @@ -import os -import traceback -from itertools import product -from logging import getLogger - -from llm_perf.utils import ( - CANONICAL_PRETRAINED_OPEN_LLM_LIST, - GENERATE_KWARGS, - INPUT_SHAPES, - OPEN_LLM_LIST, - PRETRAINED_OPEN_LLM_LIST, - is_benchmark_conducted, -) -from optimum_benchmark import ( - Benchmark, - BenchmarkConfig, - BenchmarkReport, - InferenceConfig, - ProcessConfig, - PyTorchConfig, -) -from optimum_benchmark.logging_utils import setup_logging - -SUBSET = os.getenv("SUBSET", None) -MACHINE = os.getenv("MACHINE", None) -BACKEND = "pytorch" -HARDWARE = "cpu" - -if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-{HARDWARE}-debug" - CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] - SUBSET = "unquantized" -elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-{HARDWARE}-{SUBSET}-{MACHINE}" -else: - raise ValueError("Either both 
MACHINE and SUBSET should be set for benchmarking or neither for debugging") - -ATTENTION_CONFIGS = ["eager", "sdpa"] - - -if SUBSET == "unquantized": - WEIGHTS_CONFIGS = { - # unquantized - "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, - "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, - "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, - } -else: - raise ValueError(f"Subset {SUBSET} not supported") - - -LOGGER = getLogger("llm-perf-backend") -LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") -LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") -LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") - - -def is_benchmark_supported(weights_config, attn_implementation, hardware): - if attn_implementation == "flash_attention_2": - return False - - return True - - -def benchmark_cpu_pytorch(model, attn_implementation, weights_config): - benchmark_name = f"{weights_config}-{attn_implementation}-{BACKEND}" - subfolder = f"{benchmark_name}/{model.replace('/', '--')}" - - torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] - quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] - quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] - - if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") - return - - if is_benchmark_conducted(PUSH_REPO_ID, subfolder): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") - return - - launcher_config = ProcessConfig() - scenario_config = InferenceConfig( - memory=True, - energy=True, - latency=True, - duration=10, - iterations=10, - warmup_runs=10, - input_shapes=INPUT_SHAPES, - generate_kwargs=GENERATE_KWARGS, - ) - - backend_config = PyTorchConfig( - model=model, - device="cpu", - no_weights=True, - library="transformers", - task="text-generation", - torch_dtype=torch_dtype, - quantization_scheme=quant_scheme, - quantization_config=quant_config, - attn_implementation=attn_implementation, - model_kwargs={"trust_remote_code": True}, - ) - - benchmark_config = BenchmarkConfig( - name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config - ) - - benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - try: - LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") - benchmark_report = Benchmark.launch(benchmark_config) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - except Exception: - LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") - benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - -if __name__ == "__main__": - # for isolated process - os.environ["LOG_TO_FILE"] = "0" - os.environ["LOG_LEVEL"] = "INFO" - - # for main process - setup_logging(level="INFO", prefix="MAIN-PROCESS") - - models_attentions_weights = list( - 
product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) - ) - - LOGGER.info( - f"Running a total of {len(models_attentions_weights)} benchmarks, " - f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " - f"{len(ATTENTION_CONFIGS)} attentions implementations " - f"and {len(WEIGHTS_CONFIGS)} weights configurations." - ) - - for model, attn_implementation, weights_config in models_attentions_weights: - benchmark_cpu_pytorch(model, attn_implementation, weights_config) diff --git a/llm_perf/update_llm_perf_cuda_pytorch.py b/llm_perf/update_llm_perf_cuda_pytorch.py deleted file mode 100644 index 3a216aca6..000000000 --- a/llm_perf/update_llm_perf_cuda_pytorch.py +++ /dev/null @@ -1,186 +0,0 @@ -import os -import traceback -from itertools import product -from logging import getLogger - -from llm_perf.utils import ( - CANONICAL_PRETRAINED_OPEN_LLM_LIST, - GENERATE_KWARGS, - INPUT_SHAPES, - OPEN_LLM_LIST, - PRETRAINED_OPEN_LLM_LIST, - is_benchmark_conducted, -) -from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport, InferenceConfig, ProcessConfig, PyTorchConfig -from optimum_benchmark.logging_utils import setup_logging - -SUBSET = os.getenv("SUBSET", None) -MACHINE = os.getenv("MACHINE", None) - -if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: - PUSH_REPO_ID = "optimum-benchmark/llm-perf-pytorch-cuda-debug" - CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] - SUBSET = "unquantized" -elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-pytorch-cuda-{SUBSET}-{MACHINE}" -else: - raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") - -ATTENTION_CONFIGS = ["eager", "sdpa", "flash_attention_2"] -if SUBSET == "unquantized": - WEIGHTS_CONFIGS = { - # unquantized - "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, - "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, - "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, - } -elif SUBSET == "bnb": - WEIGHTS_CONFIGS = { - # bnb - "4bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_4bit": True}}, - "8bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_8bit": True}}, - } -elif SUBSET == "gptq": - WEIGHTS_CONFIGS = { - # gptq - "4bit-gptq-exllama-v1": { - "quant_scheme": "gptq", - "torch_dtype": "float16", - "quant_config": {"bits": 4, "use_exllama ": True, "version": 1, "model_seqlen": 256}, - }, - "4bit-gptq-exllama-v2": { - "torch_dtype": "float16", - "quant_scheme": "gptq", - "quant_config": {"bits": 4, "use_exllama ": True, "version": 2, "model_seqlen": 256}, - }, - } -elif SUBSET == "awq": - WEIGHTS_CONFIGS = { - # awq - "4bit-awq-gemm": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": {"bits": 4, "version": "gemm"}, - }, - "4bit-awq-gemv": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": {"bits": 4, "version": "gemv"}, - }, - "4bit-awq-exllama-v1": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": { - "bits": 4, - "version": "exllama", - "exllama_config": {"version": 1, "max_input_len": 64, "max_batch_size": 1}, - }, - }, - "4bit-awq-exllama-v2": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": { - "bits": 4, - "version": "exllama", - "exllama_config": {"version": 2, "max_input_len": 64, 
"max_batch_size": 1}, - }, - }, - } - - -LOGGER = getLogger("llm-perf-backend") -LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") -LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") -LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") - - -def is_benchmark_supported(weights_config, attn_implementation): - if attn_implementation == "flash_attention_2" and weights_config == "float32": - return False - - return True - - -def benchmark_cuda_pytorch(model, attn_implementation, weights_config): - benchmark_name = f"{weights_config}-{attn_implementation}" - subfolder = f"{benchmark_name}/{model.replace('/', '--')}" - - torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] - quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] - quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] - - if not is_benchmark_supported(weights_config, attn_implementation): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") - return - - if is_benchmark_conducted(PUSH_REPO_ID, subfolder): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") - return - - launcher_config = ProcessConfig(device_isolation=True, device_isolation_action="kill") - scenario_config = InferenceConfig( - memory=True, - energy=True, - latency=True, - duration=10, - iterations=10, - warmup_runs=10, - input_shapes=INPUT_SHAPES, - generate_kwargs=GENERATE_KWARGS, - ) - backend_config = PyTorchConfig( - model=model, - device="cuda", - device_ids="0", - no_weights=True, - library="transformers", - task="text-generation", - torch_dtype=torch_dtype, - quantization_scheme=quant_scheme, - quantization_config=quant_config, - attn_implementation=attn_implementation, - model_kwargs={"trust_remote_code": True}, - ) - - benchmark_config = BenchmarkConfig( - name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config - ) - - benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - try: - LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") - benchmark_report = Benchmark.launch(benchmark_config) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - except Exception: - LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") - benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - -if __name__ == "__main__": - # for isolated process - os.environ["LOG_TO_FILE"] = "0" - os.environ["LOG_LEVEL"] = "INFO" - - # for main process - setup_logging(level="INFO", prefix="MAIN-PROCESS") - - models_attentions_weights = list( - product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) - ) - - LOGGER.info( - f"Running a total of {len(models_attentions_weights)} benchmarks, " - f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " - f"{len(ATTENTION_CONFIGS)} attentions implementations " - f"and {len(WEIGHTS_CONFIGS)} weights configurations." 
- ) - - for model, attn_implementation, weights_config in models_attentions_weights: - benchmark_cuda_pytorch(model, attn_implementation, weights_config) diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index df45288f5..04c138d42 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -4,8 +4,8 @@ import pandas as pd from huggingface_hub import create_repo, snapshot_download, upload_file from tqdm import tqdm -from utils import load_hardware_configs +from llm_perf.common.utils import load_hardware_configs from optimum_benchmark import Benchmark REPO_TYPE = "dataset" From ae7b939563ecf5e237d908aa0bf93b8581d05185 Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 10 Sep 2024 13:48:57 +0000 Subject: [PATCH 51/73] refractoring done --- llm_perf/update_llm_perf_leaderboard.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index 04c138d42..057315689 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -5,7 +5,7 @@ from huggingface_hub import create_repo, snapshot_download, upload_file from tqdm import tqdm -from llm_perf.common.utils import load_hardware_configs +from llm_perf.common.hardware_config import load_hardware_configs from optimum_benchmark import Benchmark REPO_TYPE = "dataset" From 5c80cad3ff9293b75f8bb24137d25994b6c99af6 Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 10 Sep 2024 13:50:17 +0000 Subject: [PATCH 52/73] refractoring done --- .github/workflows/update_llm_perf_cuda_pytorch.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/update_llm_perf_cuda_pytorch.yaml b/.github/workflows/update_llm_perf_cuda_pytorch.yaml index 658e63fd1..5e3f3e976 100644 --- a/.github/workflows/update_llm_perf_cuda_pytorch.yaml +++ b/.github/workflows/update_llm_perf_cuda_pytorch.yaml @@ -53,4 +53,4 @@ jobs: pip install packaging && pip install flash-attn einops scipy auto-gptq optimum bitsandbytes autoawq codecarbon pip install -U transformers huggingface_hub[hf_transfer] pip install -e . 
- python llm_perf/benchmarks/update_llm_perf_cuda_pytorch.py + python llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py From e75a361aa98dfc43ce71387b05d823fd6aaf8db9 Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 10 Sep 2024 13:51:35 +0000 Subject: [PATCH 53/73] refractoring done --- llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py | 5 +++-- llm_perf/update_llm_perf_leaderboard.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py b/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py index f22b28eb0..d4ebb6d20 100644 --- a/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py +++ b/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py @@ -1,12 +1,13 @@ from typing import Any, Dict, List -from llm_perf.common.benchmark_runner import BenchmarkRunner -from llm_perf.common.utils import GENERATE_KWARGS, INPUT_SHAPES from optimum_benchmark import PyTorchConfig from optimum_benchmark.benchmark.config import BenchmarkConfig from optimum_benchmark.launchers.process.config import ProcessConfig from optimum_benchmark.scenarios.inference.config import InferenceConfig +from ..common.benchmark_runner import BenchmarkRunner +from ..common.utils import GENERATE_KWARGS, INPUT_SHAPES + class CUDAPyTorchBenchmarkRunner(BenchmarkRunner): def __init__(self): diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index 057315689..77c62e347 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -5,9 +5,10 @@ from huggingface_hub import create_repo, snapshot_download, upload_file from tqdm import tqdm -from llm_perf.common.hardware_config import load_hardware_configs from optimum_benchmark import Benchmark +from .common.hardware_config import load_hardware_configs + REPO_TYPE = "dataset" MAIN_REPO_ID = "optimum-benchmark/llm-perf-leaderboard" PERF_REPO_ID = "optimum-benchmark/llm-perf-{backend}-{hardware_backend}-{subset}-{machine}" From 07d1d32c18b28d4b8c5b957aafffe55e8adbea40 Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 10 Sep 2024 13:57:59 +0000 Subject: [PATCH 54/73] refractoring done --- llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py | 4 ++-- llm_perf/update_llm_perf_leaderboard.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py b/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py index d4ebb6d20..89fb6e0ab 100644 --- a/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py +++ b/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py @@ -5,8 +5,8 @@ from optimum_benchmark.launchers.process.config import ProcessConfig from optimum_benchmark.scenarios.inference.config import InferenceConfig -from ..common.benchmark_runner import BenchmarkRunner -from ..common.utils import GENERATE_KWARGS, INPUT_SHAPES +from llm_perf.common.benchmark_runner import BenchmarkRunner +from llm_perf.common.utils import GENERATE_KWARGS, INPUT_SHAPES class CUDAPyTorchBenchmarkRunner(BenchmarkRunner): diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index 77c62e347..9ab650fbb 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -2,13 +2,12 @@ from glob import glob import pandas as pd +from llm_perf.common.hardware_config import load_hardware_configs from huggingface_hub import create_repo, snapshot_download, upload_file from 
tqdm import tqdm from optimum_benchmark import Benchmark -from .common.hardware_config import load_hardware_configs - REPO_TYPE = "dataset" MAIN_REPO_ID = "optimum-benchmark/llm-perf-leaderboard" PERF_REPO_ID = "optimum-benchmark/llm-perf-{backend}-{hardware_backend}-{subset}-{machine}" From 34f958f6d9dfdb1b3c8e78121395ea96921f78b1 Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 10 Sep 2024 13:58:12 +0000 Subject: [PATCH 55/73] refractoring done --- .github/workflows/update_llm_perf_leaderboard.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/update_llm_perf_leaderboard.yaml b/.github/workflows/update_llm_perf_leaderboard.yaml index 10ed80c98..9b63c21cd 100644 --- a/.github/workflows/update_llm_perf_leaderboard.yaml +++ b/.github/workflows/update_llm_perf_leaderboard.yaml @@ -2,6 +2,7 @@ name: Update LLM Perf Leaderboard on: workflow_dispatch: + push: schedule: - cron: "0 */6 * * *" From 35dc1cfc25280dd5bc53391a7e249bb2d1ebdf2e Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 10 Sep 2024 13:59:51 +0000 Subject: [PATCH 56/73] refractoring done --- llm_perf/common/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 llm_perf/common/__init__.py diff --git a/llm_perf/common/__init__.py b/llm_perf/common/__init__.py new file mode 100644 index 000000000..e69de29bb From 93485157d28d9f66ca7c9c3826b6ad32c2e05973 Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 10 Sep 2024 14:06:57 +0000 Subject: [PATCH 57/73] refractoring done --- .../update_llm_perf_cuda_pytorch.py | 5 ++--- llm_perf/common/hardware_config.py | 20 +++++++++---------- llm_perf/hardware.yml | 12 +++-------- llm_perf/update_llm_perf_leaderboard.py | 15 +++++++------- 4 files changed, 23 insertions(+), 29 deletions(-) diff --git a/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py b/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py index 89fb6e0ab..f22b28eb0 100644 --- a/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py +++ b/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py @@ -1,13 +1,12 @@ from typing import Any, Dict, List +from llm_perf.common.benchmark_runner import BenchmarkRunner +from llm_perf.common.utils import GENERATE_KWARGS, INPUT_SHAPES from optimum_benchmark import PyTorchConfig from optimum_benchmark.benchmark.config import BenchmarkConfig from optimum_benchmark.launchers.process.config import ProcessConfig from optimum_benchmark.scenarios.inference.config import InferenceConfig -from llm_perf.common.benchmark_runner import BenchmarkRunner -from llm_perf.common.utils import GENERATE_KWARGS, INPUT_SHAPES - class CUDAPyTorchBenchmarkRunner(BenchmarkRunner): def __init__(self): diff --git a/llm_perf/common/hardware_config.py b/llm_perf/common/hardware_config.py index 5b1cfec3d..296c46b28 100644 --- a/llm_perf/common/hardware_config.py +++ b/llm_perf/common/hardware_config.py @@ -2,24 +2,24 @@ import yaml +from dataclasses import dataclass +from typing import List +@dataclass class HardwareConfig: - def __init__(self, data: Dict[str, Any]): - self.machine = data["machine"] - self.description = data["description"] - self.hardware_provider = data["hardware provider"] - self.hardware_backend = data["hardware_backend type"] - self.subsets = data["subsets"] - self.backends = data["backends"] + machine: str + hardware: str + subsets: List[str] + backends: List[str] def __repr__(self): return ( - f"HardwareConfig(machine='{self.machine}', description='{self.description}', " - f"hardware_type={self.hardware_backend}, 
subsets={self.subsets}, backends={self.backends})" + f"HardwareConfig(machine='{self.machine}', hardware='{self.hardware}', " + f"subsets={self.subsets}, backends={self.backends})" ) def load_hardware_configs(file_path: str) -> List[HardwareConfig]: with open(file_path, "r") as file: data = yaml.safe_load(file) - return [HardwareConfig(config) for config in data] + return [HardwareConfig(**config) for config in data] diff --git a/llm_perf/hardware.yml b/llm_perf/hardware.yml index 5f0604e35..40f579189 100644 --- a/llm_perf/hardware.yml +++ b/llm_perf/hardware.yml @@ -1,7 +1,5 @@ - machine: 1xA10 - description: A10-24GB-150W 🖥️ - hardware_provider: nvidia - hardware_backend: cuda + hardware: cuda subsets: - unquantized - awq @@ -11,9 +9,7 @@ - pytorch - machine: 1xA100 - description: A100-80GB-275W 🖥️ - hardware_provider: nvidia - hardware_backend: cuda + hardware: cuda subsets: - unquantized - awq @@ -23,9 +19,7 @@ - pytorch - machine: 1xT4 - description: T4-16GB-70W 🖥️ - hardware_provider: nvidia - hardware_backend: cuda + hardware: cuda subsets: - unquantized - awq diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index 9ab650fbb..b3cf0888f 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -2,26 +2,26 @@ from glob import glob import pandas as pd -from llm_perf.common.hardware_config import load_hardware_configs from huggingface_hub import create_repo, snapshot_download, upload_file from tqdm import tqdm +from llm_perf.common.hardware_config import load_hardware_configs from optimum_benchmark import Benchmark REPO_TYPE = "dataset" MAIN_REPO_ID = "optimum-benchmark/llm-perf-leaderboard" -PERF_REPO_ID = "optimum-benchmark/llm-perf-{backend}-{hardware_backend}-{subset}-{machine}" +PERF_REPO_ID = "optimum-benchmark/llm-perf-{backend}-{hardware}-{subset}-{machine}" PERF_DF = "perf-df-{subset}-{machine}.csv" LLM_DF = "llm-df.csv" -def gather_benchmarks(subset: str, machine: str, backend: str, hardware_backend: str): +def gather_benchmarks(subset: str, machine: str, backend: str, hardware: str): """ Gather the benchmarks for a given machine """ perf_repo_id = PERF_REPO_ID.format( - subset=subset, machine=machine, backend=backend, hardware_backend=hardware_backend + subset=subset, machine=machine, backend=backend, hardware=hardware ) snapshot = snapshot_download(repo_type=REPO_TYPE, repo_id=perf_repo_id, allow_patterns=["**/benchmark.json"]) @@ -46,12 +46,13 @@ def update_perf_dfs(): for subset in hardware_config.subsets: for backend in hardware_config.backends: try: - gather_benchmarks(subset, hardware_config.machine, backend, hardware_config.hardware_provider) + gather_benchmarks(subset, hardware_config.machine, backend, hardware_config.hardware) except Exception as e: print( - f"Error gathering benchmarks for {hardware_config.machine} with {hardware_config.hardware_provider} and {subset} with {backend}: {e}" + f"Error gathering benchmarks for machine {hardware_config.machine}, " + f"hardware {hardware_config.hardware}, subset {subset}, backend {backend}: {e}" ) - + scrapping_script = """ git clone https://github.com/Weyaxi/scrape-open-llm-leaderboard.git From 8b28005da2ec2a7d59bc2723d8de2c27ca984bc7 Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 10 Sep 2024 14:07:23 +0000 Subject: [PATCH 58/73] refractoring done --- llm_perf/common/hardware_config.py | 5 ++--- llm_perf/update_llm_perf_leaderboard.py | 6 ++---- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git 
a/llm_perf/common/hardware_config.py b/llm_perf/common/hardware_config.py index 296c46b28..ed28222e2 100644 --- a/llm_perf/common/hardware_config.py +++ b/llm_perf/common/hardware_config.py @@ -1,9 +1,8 @@ -from typing import Any, Dict, List +from dataclasses import dataclass +from typing import List import yaml -from dataclasses import dataclass -from typing import List @dataclass class HardwareConfig: diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index b3cf0888f..80a7dfbd7 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -20,9 +20,7 @@ def gather_benchmarks(subset: str, machine: str, backend: str, hardware: str): """ Gather the benchmarks for a given machine """ - perf_repo_id = PERF_REPO_ID.format( - subset=subset, machine=machine, backend=backend, hardware=hardware - ) + perf_repo_id = PERF_REPO_ID.format(subset=subset, machine=machine, backend=backend, hardware=hardware) snapshot = snapshot_download(repo_type=REPO_TYPE, repo_id=perf_repo_id, allow_patterns=["**/benchmark.json"]) dfs = [] @@ -52,7 +50,7 @@ def update_perf_dfs(): f"Error gathering benchmarks for machine {hardware_config.machine}, " f"hardware {hardware_config.hardware}, subset {subset}, backend {backend}: {e}" ) - + scrapping_script = """ git clone https://github.com/Weyaxi/scrape-open-llm-leaderboard.git From 7cb3ea0bdb45ffb1e24ac708d8632862006b3e72 Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 10 Sep 2024 14:32:48 +0000 Subject: [PATCH 59/73] remove push on workflow used for debugging --- .github/workflows/update_llm_perf_leaderboard.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/update_llm_perf_leaderboard.yaml b/.github/workflows/update_llm_perf_leaderboard.yaml index 9b63c21cd..10ed80c98 100644 --- a/.github/workflows/update_llm_perf_leaderboard.yaml +++ b/.github/workflows/update_llm_perf_leaderboard.yaml @@ -2,7 +2,6 @@ name: Update LLM Perf Leaderboard on: workflow_dispatch: - push: schedule: - cron: "0 */6 * * *" From c4c888705ca5ab725b9860202974bd551114e6bb Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 12 Sep 2024 11:08:47 +0000 Subject: [PATCH 60/73] refractor pytorch cpu --- .../update_llm_perf_cpu_pytorch.py | 71 +++++++ llm_perf/common/benchmark_runner.py | 3 +- llm_perf/update_llm_perf_cpu_pytorch.py | 147 -------------- llm_perf/update_llm_perf_cuda_pytorch.py | 186 ------------------ 4 files changed, 72 insertions(+), 335 deletions(-) create mode 100644 llm_perf/benchmark_runners/update_llm_perf_cpu_pytorch.py delete mode 100644 llm_perf/update_llm_perf_cpu_pytorch.py delete mode 100644 llm_perf/update_llm_perf_cuda_pytorch.py diff --git a/llm_perf/benchmark_runners/update_llm_perf_cpu_pytorch.py b/llm_perf/benchmark_runners/update_llm_perf_cpu_pytorch.py new file mode 100644 index 000000000..62a52a67b --- /dev/null +++ b/llm_perf/benchmark_runners/update_llm_perf_cpu_pytorch.py @@ -0,0 +1,71 @@ +from typing import Any, Dict, List + +from llm_perf.common.benchmark_runner import BenchmarkRunner +from llm_perf.common.utils import GENERATE_KWARGS, INPUT_SHAPES +from optimum_benchmark import PyTorchConfig +from optimum_benchmark.benchmark.config import BenchmarkConfig +from optimum_benchmark.launchers.process.config import ProcessConfig +from optimum_benchmark.scenarios.inference.config import InferenceConfig + + +class CPUPyTorchBenchmarkRunner(BenchmarkRunner): + def __init__(self): + super().__init__(backend="pytorch", hardware="cpu") + + def get_benchmark_config(self, 
model: str, attn_implementation: str, weights_config: str) -> BenchmarkConfig: + assert ( + weights_config in self.weights_configs + ), f"your config does not contain {weights_config}, adjust your _get_weights_configs to fix this issue" + + torch_dtype = self.weights_configs[weights_config]["torch_dtype"] + quant_scheme = self.weights_configs[weights_config]["quant_scheme"] + quant_config = self.weights_configs[weights_config]["quant_config"] + + launcher_config = ProcessConfig() + scenario_config = InferenceConfig( + memory=True, + energy=True, + latency=True, + duration=10, + iterations=10, + warmup_runs=10, + input_shapes=INPUT_SHAPES, + generate_kwargs=GENERATE_KWARGS, + ) + backend_config = PyTorchConfig( + model=model, + device="cpu", + no_weights=True, + library="transformers", + task="text-generation", + torch_dtype=torch_dtype, + quantization_scheme=quant_scheme, + quantization_config=quant_config, + attn_implementation=attn_implementation, + model_kwargs={"trust_remote_code": True}, + ) + + return BenchmarkConfig( + name=f"{weights_config}-{attn_implementation}", + scenario=scenario_config, + launcher=launcher_config, + backend=backend_config, + ) + + def _get_weights_configs(self, subset) -> Dict[str, Dict[str, Any]]: + if subset == "unquantized": + return { + "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, + "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, + "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, + } + else: + raise ValueError(f"Unknown subset: {subset}") + + def _get_attention_configs(self) -> List[str]: + return ["eager", "sdpa"] + + +if __name__ == "__main__": + runner = CPUPyTorchBenchmarkRunner() + runner.run_benchmarks() diff --git a/llm_perf/common/benchmark_runner.py b/llm_perf/common/benchmark_runner.py index 513e6c726..40673e126 100644 --- a/llm_perf/common/benchmark_runner.py +++ b/llm_perf/common/benchmark_runner.py @@ -48,9 +48,8 @@ def _get_weights_configs(self, subset: str) -> Dict[str, Dict[str, Any]]: def _get_attention_configs(self) -> List[str]: raise NotImplementedError("This method should be implemented in the child class") - @abstractmethod def is_benchmark_supported(self, weights_config: str, attn_implementation: str) -> bool: - raise NotImplementedError("This method should be implemented in the child class") + return True def run_benchmarks(self): os.environ["LOG_TO_FILE"] = "0" diff --git a/llm_perf/update_llm_perf_cpu_pytorch.py b/llm_perf/update_llm_perf_cpu_pytorch.py deleted file mode 100644 index def443cfe..000000000 --- a/llm_perf/update_llm_perf_cpu_pytorch.py +++ /dev/null @@ -1,147 +0,0 @@ -import os -import traceback -from itertools import product -from logging import getLogger - -from llm_perf.utils import ( - CANONICAL_PRETRAINED_OPEN_LLM_LIST, - GENERATE_KWARGS, - INPUT_SHAPES, - OPEN_LLM_LIST, - PRETRAINED_OPEN_LLM_LIST, - is_benchmark_conducted, -) -from optimum_benchmark import ( - Benchmark, - BenchmarkConfig, - BenchmarkReport, - InferenceConfig, - ProcessConfig, - PyTorchConfig, -) -from optimum_benchmark.logging_utils import setup_logging - -SUBSET = os.getenv("SUBSET", None) -MACHINE = os.getenv("MACHINE", None) -BACKEND = "pytorch" -HARDWARE = "cpu" - -if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-{HARDWARE}-debug" - CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] - SUBSET = "unquantized" -elif os.getenv("MACHINE", None) is not None and 
os.getenv("SUBSET", None) is not None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-{HARDWARE}-{SUBSET}-{MACHINE}" -else: - raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") - -ATTENTION_CONFIGS = ["eager", "sdpa"] - - -if SUBSET == "unquantized": - WEIGHTS_CONFIGS = { - # unquantized - "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, - "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, - "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, - } -else: - raise ValueError(f"Subset {SUBSET} not supported") - - -LOGGER = getLogger("llm-perf-backend") -LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") -LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") -LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") - - -def is_benchmark_supported(weights_config, attn_implementation, hardware): - if attn_implementation == "flash_attention_2": - return False - - return True - - -def benchmark_cpu_pytorch(model, attn_implementation, weights_config): - benchmark_name = f"{weights_config}-{attn_implementation}-{BACKEND}" - subfolder = f"{benchmark_name}/{model.replace('/', '--')}" - - torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] - quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] - quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] - - if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") - return - - if is_benchmark_conducted(PUSH_REPO_ID, subfolder): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") - return - - launcher_config = ProcessConfig() - scenario_config = InferenceConfig( - memory=True, - energy=True, - latency=True, - duration=10, - iterations=10, - warmup_runs=10, - input_shapes=INPUT_SHAPES, - generate_kwargs=GENERATE_KWARGS, - ) - - backend_config = PyTorchConfig( - model=model, - device="cpu", - no_weights=True, - library="transformers", - task="text-generation", - torch_dtype=torch_dtype, - quantization_scheme=quant_scheme, - quantization_config=quant_config, - attn_implementation=attn_implementation, - model_kwargs={"trust_remote_code": True}, - ) - - benchmark_config = BenchmarkConfig( - name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config - ) - - benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - try: - LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") - benchmark_report = Benchmark.launch(benchmark_config) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - except Exception: - LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") - benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - -if __name__ == "__main__": - # for isolated process - 
os.environ["LOG_TO_FILE"] = "0" - os.environ["LOG_LEVEL"] = "INFO" - - # for main process - setup_logging(level="INFO", prefix="MAIN-PROCESS") - - models_attentions_weights = list( - product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) - ) - - LOGGER.info( - f"Running a total of {len(models_attentions_weights)} benchmarks, " - f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " - f"{len(ATTENTION_CONFIGS)} attentions implementations " - f"and {len(WEIGHTS_CONFIGS)} weights configurations." - ) - - for model, attn_implementation, weights_config in models_attentions_weights: - benchmark_cpu_pytorch(model, attn_implementation, weights_config) diff --git a/llm_perf/update_llm_perf_cuda_pytorch.py b/llm_perf/update_llm_perf_cuda_pytorch.py deleted file mode 100644 index 3a216aca6..000000000 --- a/llm_perf/update_llm_perf_cuda_pytorch.py +++ /dev/null @@ -1,186 +0,0 @@ -import os -import traceback -from itertools import product -from logging import getLogger - -from llm_perf.utils import ( - CANONICAL_PRETRAINED_OPEN_LLM_LIST, - GENERATE_KWARGS, - INPUT_SHAPES, - OPEN_LLM_LIST, - PRETRAINED_OPEN_LLM_LIST, - is_benchmark_conducted, -) -from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport, InferenceConfig, ProcessConfig, PyTorchConfig -from optimum_benchmark.logging_utils import setup_logging - -SUBSET = os.getenv("SUBSET", None) -MACHINE = os.getenv("MACHINE", None) - -if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: - PUSH_REPO_ID = "optimum-benchmark/llm-perf-pytorch-cuda-debug" - CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] - SUBSET = "unquantized" -elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-pytorch-cuda-{SUBSET}-{MACHINE}" -else: - raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") - -ATTENTION_CONFIGS = ["eager", "sdpa", "flash_attention_2"] -if SUBSET == "unquantized": - WEIGHTS_CONFIGS = { - # unquantized - "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, - "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, - "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, - } -elif SUBSET == "bnb": - WEIGHTS_CONFIGS = { - # bnb - "4bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_4bit": True}}, - "8bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_8bit": True}}, - } -elif SUBSET == "gptq": - WEIGHTS_CONFIGS = { - # gptq - "4bit-gptq-exllama-v1": { - "quant_scheme": "gptq", - "torch_dtype": "float16", - "quant_config": {"bits": 4, "use_exllama ": True, "version": 1, "model_seqlen": 256}, - }, - "4bit-gptq-exllama-v2": { - "torch_dtype": "float16", - "quant_scheme": "gptq", - "quant_config": {"bits": 4, "use_exllama ": True, "version": 2, "model_seqlen": 256}, - }, - } -elif SUBSET == "awq": - WEIGHTS_CONFIGS = { - # awq - "4bit-awq-gemm": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": {"bits": 4, "version": "gemm"}, - }, - "4bit-awq-gemv": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": {"bits": 4, "version": "gemv"}, - }, - "4bit-awq-exllama-v1": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": { - "bits": 4, - "version": "exllama", - "exllama_config": {"version": 1, "max_input_len": 64, "max_batch_size": 1}, - }, - }, - 
"4bit-awq-exllama-v2": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": { - "bits": 4, - "version": "exllama", - "exllama_config": {"version": 2, "max_input_len": 64, "max_batch_size": 1}, - }, - }, - } - - -LOGGER = getLogger("llm-perf-backend") -LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") -LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") -LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") - - -def is_benchmark_supported(weights_config, attn_implementation): - if attn_implementation == "flash_attention_2" and weights_config == "float32": - return False - - return True - - -def benchmark_cuda_pytorch(model, attn_implementation, weights_config): - benchmark_name = f"{weights_config}-{attn_implementation}" - subfolder = f"{benchmark_name}/{model.replace('/', '--')}" - - torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] - quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] - quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] - - if not is_benchmark_supported(weights_config, attn_implementation): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") - return - - if is_benchmark_conducted(PUSH_REPO_ID, subfolder): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") - return - - launcher_config = ProcessConfig(device_isolation=True, device_isolation_action="kill") - scenario_config = InferenceConfig( - memory=True, - energy=True, - latency=True, - duration=10, - iterations=10, - warmup_runs=10, - input_shapes=INPUT_SHAPES, - generate_kwargs=GENERATE_KWARGS, - ) - backend_config = PyTorchConfig( - model=model, - device="cuda", - device_ids="0", - no_weights=True, - library="transformers", - task="text-generation", - torch_dtype=torch_dtype, - quantization_scheme=quant_scheme, - quantization_config=quant_config, - attn_implementation=attn_implementation, - model_kwargs={"trust_remote_code": True}, - ) - - benchmark_config = BenchmarkConfig( - name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config - ) - - benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - try: - LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") - benchmark_report = Benchmark.launch(benchmark_config) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - except Exception: - LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") - benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - -if __name__ == "__main__": - # for isolated process - os.environ["LOG_TO_FILE"] = "0" - os.environ["LOG_LEVEL"] = "INFO" - - # for main process - setup_logging(level="INFO", prefix="MAIN-PROCESS") - - models_attentions_weights = list( - product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) - ) - - LOGGER.info( - f"Running a total of {len(models_attentions_weights)} benchmarks, " - f"with 
{len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " - f"{len(ATTENTION_CONFIGS)} attentions implementations " - f"and {len(WEIGHTS_CONFIGS)} weights configurations." - ) - - for model, attn_implementation, weights_config in models_attentions_weights: - benchmark_cuda_pytorch(model, attn_implementation, weights_config) From 32626f97fd22993fdbb52bdfda8cc7553c7bc139 Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 12 Sep 2024 11:09:33 +0000 Subject: [PATCH 61/73] refractor pytorch cpu --- llm_perf/hardware.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/llm_perf/hardware.yml b/llm_perf/hardware.yml index 40f579189..1a351b674 100644 --- a/llm_perf/hardware.yml +++ b/llm_perf/hardware.yml @@ -25,5 +25,12 @@ - awq - bnb - gptq + backends: + - pytorch + +- machine: 32vCPU-C7i + hardware: cpu + subsets: + - unquantized backends: - pytorch \ No newline at end of file From 99a00dfd3eb342981380593eb6d0c409cd392b31 Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 12 Sep 2024 11:09:42 +0000 Subject: [PATCH 62/73] refractor pytorch cpu --- llm_perf/update_llm_perf_leaderboard.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index a99a0fc3a..619e54224 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -45,10 +45,10 @@ def update_perf_dfs(): for backend in hardware_config.backends: try: gather_benchmarks(subset, hardware_config.machine, backend, hardware_config.hardware) - except Exception as e: + except Exception: print( - f"benchmark for subset: {subset}, machine: {hardware_config.machine}, backend: {backend}, hardware: {hardware_config.hardware} not found" - ) + f"benchmark for subset: {subset}, machine: {hardware_config.machine}, backend: {backend}, hardware: {hardware_config.hardware} not found" + ) scrapping_script = """ From b27f80609089e4e90e6bde327ec48cf5324f5368 Mon Sep 17 00:00:00 2001 From: baptiste Date: Thu, 12 Sep 2024 16:46:16 +0000 Subject: [PATCH 63/73] fix failling workflow --- .github/workflows/update_llm_perf_cpu_pytorch.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/update_llm_perf_cpu_pytorch.yaml b/.github/workflows/update_llm_perf_cpu_pytorch.yaml index 5a046a804..4fb972879 100644 --- a/.github/workflows/update_llm_perf_cpu_pytorch.yaml +++ b/.github/workflows/update_llm_perf_cpu_pytorch.yaml @@ -50,4 +50,4 @@ jobs: pip install packaging && pip install einops scipy optimum codecarbon pip install -U transformers huggingface_hub[hf_transfer] pip install -e . 
- python llm_perf/update_llm_perf_cpu_pytorch.py + python llm_perf/benchmark_runners/update_llm_perf_cpu_pytorch.py From 10c47eaf2f753c3664720b10c07eed9f84c736f9 Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 17 Sep 2024 11:03:15 +0000 Subject: [PATCH 64/73] fix broken canonical list --- llm_perf/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_perf/utils.py b/llm_perf/utils.py index 1f478913a..849f6adae 100644 --- a/llm_perf/utils.py +++ b/llm_perf/utils.py @@ -93,7 +93,7 @@ "google/gemma-2b", "google/gemma-7b", "google/recurrentgemma-2b", - "google/recurrentgemma-7b", + "google/recurrentgemma-9b", "internlm/internlm-20b", "internlm/internlm2-20b", "huggyllama/llama-7b", From 60aa33e257f53a74d9a00f3840161fc591e65c07 Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 17 Sep 2024 12:04:50 +0000 Subject: [PATCH 65/73] fix broken canonical list --- llm_perf/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_perf/utils.py b/llm_perf/utils.py index 849f6adae..6a5584284 100644 --- a/llm_perf/utils.py +++ b/llm_perf/utils.py @@ -109,7 +109,7 @@ "microsoft/rho-math-1b-v0.1", "mistralai/Mistral-7B-v0.1", "mistralai/Mixtral-8x7B-v0.1", - *"mistralai/Mixtral-8x22B-v0.1", + "mistralai/Mixtral-8x22B-v0.1", "openai-community/gpt2", "openai-community/gpt2-large", "stabilityai/stablelm-3b-4e1t", From f3bc069168614c35f04d6e4c61798c180699f0b5 Mon Sep 17 00:00:00 2001 From: baptiste Date: Fri, 20 Sep 2024 12:00:26 +0000 Subject: [PATCH 66/73] merge main --- optimum_benchmark/trackers/latency.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/optimum_benchmark/trackers/latency.py b/optimum_benchmark/trackers/latency.py index 1e0f1e95b..b6d5b0257 100644 --- a/optimum_benchmark/trackers/latency.py +++ b/optimum_benchmark/trackers/latency.py @@ -121,8 +121,9 @@ def __init__(self, device: str, backend: str): self.device = device self.backend = backend self.is_asynchronous = self.backend == "pytorch" and self.device == "cuda" - self.is_distributed = (self.backend != "vllm" and - is_torch_distributed_available() and torch.distributed.is_initialized()) + self.is_distributed = ( + self.backend != "vllm" and is_torch_distributed_available() and torch.distributed.is_initialized() + ) if self.is_asynchronous: LOGGER.info("\t+ Tracking latency using Pytorch CUDA events") From b2d5f1247cbd694901f7df97e9017691d56ba473 Mon Sep 17 00:00:00 2001 From: baptiste Date: Mon, 23 Sep 2024 06:56:12 +0000 Subject: [PATCH 67/73] merge main into branch --- llm_perf/update_llm_perf_cpu_pytorch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_perf/update_llm_perf_cpu_pytorch.py b/llm_perf/update_llm_perf_cpu_pytorch.py index def443cfe..39723b0fd 100644 --- a/llm_perf/update_llm_perf_cpu_pytorch.py +++ b/llm_perf/update_llm_perf_cpu_pytorch.py @@ -28,7 +28,7 @@ if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-{HARDWARE}-debug" - CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] + CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] # noqa: F811 SUBSET = "unquantized" elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-{HARDWARE}-{SUBSET}-{MACHINE}" From 08f70e2c5329a6035b59e8f893a3b345758a8156 Mon Sep 17 00:00:00 2001 From: baptiste Date: Mon, 23 Sep 2024 06:56:46 +0000 Subject: [PATCH 68/73] merge main into branch --- llm_perf/update_llm_perf_cpu_pytorch.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_perf/update_llm_perf_cpu_pytorch.py b/llm_perf/update_llm_perf_cpu_pytorch.py index 39723b0fd..250355505 100644 --- a/llm_perf/update_llm_perf_cpu_pytorch.py +++ b/llm_perf/update_llm_perf_cpu_pytorch.py @@ -28,7 +28,7 @@ if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-{HARDWARE}-debug" - CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] # noqa: F811 + CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] # noqa: F811 SUBSET = "unquantized" elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-{HARDWARE}-{SUBSET}-{MACHINE}" From ab1710a27cfcba43da3337096873262c600e90f3 Mon Sep 17 00:00:00 2001 From: baptiste Date: Mon, 23 Sep 2024 06:58:09 +0000 Subject: [PATCH 69/73] merge main into branch --- llm_perf/update_llm_perf_cpu_pytorch.py | 147 ------------------ llm_perf/update_llm_perf_cuda_pytorch.py | 186 ----------------------- 2 files changed, 333 deletions(-) delete mode 100644 llm_perf/update_llm_perf_cpu_pytorch.py delete mode 100644 llm_perf/update_llm_perf_cuda_pytorch.py diff --git a/llm_perf/update_llm_perf_cpu_pytorch.py b/llm_perf/update_llm_perf_cpu_pytorch.py deleted file mode 100644 index 250355505..000000000 --- a/llm_perf/update_llm_perf_cpu_pytorch.py +++ /dev/null @@ -1,147 +0,0 @@ -import os -import traceback -from itertools import product -from logging import getLogger - -from llm_perf.utils import ( - CANONICAL_PRETRAINED_OPEN_LLM_LIST, - GENERATE_KWARGS, - INPUT_SHAPES, - OPEN_LLM_LIST, - PRETRAINED_OPEN_LLM_LIST, - is_benchmark_conducted, -) -from optimum_benchmark import ( - Benchmark, - BenchmarkConfig, - BenchmarkReport, - InferenceConfig, - ProcessConfig, - PyTorchConfig, -) -from optimum_benchmark.logging_utils import setup_logging - -SUBSET = os.getenv("SUBSET", None) -MACHINE = os.getenv("MACHINE", None) -BACKEND = "pytorch" -HARDWARE = "cpu" - -if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-{HARDWARE}-debug" - CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] # noqa: F811 - SUBSET = "unquantized" -elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-{BACKEND}-{HARDWARE}-{SUBSET}-{MACHINE}" -else: - raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") - -ATTENTION_CONFIGS = ["eager", "sdpa"] - - -if SUBSET == "unquantized": - WEIGHTS_CONFIGS = { - # unquantized - "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, - "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, - "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, - } -else: - raise ValueError(f"Subset {SUBSET} not supported") - - -LOGGER = getLogger("llm-perf-backend") -LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") -LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") -LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") - - -def is_benchmark_supported(weights_config, attn_implementation, hardware): - if attn_implementation == "flash_attention_2": - return False - - return True - - -def benchmark_cpu_pytorch(model, attn_implementation, weights_config): - benchmark_name = 
f"{weights_config}-{attn_implementation}-{BACKEND}" - subfolder = f"{benchmark_name}/{model.replace('/', '--')}" - - torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] - quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] - quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] - - if not is_benchmark_supported(weights_config, attn_implementation, HARDWARE): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") - return - - if is_benchmark_conducted(PUSH_REPO_ID, subfolder): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") - return - - launcher_config = ProcessConfig() - scenario_config = InferenceConfig( - memory=True, - energy=True, - latency=True, - duration=10, - iterations=10, - warmup_runs=10, - input_shapes=INPUT_SHAPES, - generate_kwargs=GENERATE_KWARGS, - ) - - backend_config = PyTorchConfig( - model=model, - device="cpu", - no_weights=True, - library="transformers", - task="text-generation", - torch_dtype=torch_dtype, - quantization_scheme=quant_scheme, - quantization_config=quant_config, - attn_implementation=attn_implementation, - model_kwargs={"trust_remote_code": True}, - ) - - benchmark_config = BenchmarkConfig( - name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config - ) - - benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - try: - LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") - benchmark_report = Benchmark.launch(benchmark_config) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - except Exception: - LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") - benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - -if __name__ == "__main__": - # for isolated process - os.environ["LOG_TO_FILE"] = "0" - os.environ["LOG_LEVEL"] = "INFO" - - # for main process - setup_logging(level="INFO", prefix="MAIN-PROCESS") - - models_attentions_weights = list( - product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) - ) - - LOGGER.info( - f"Running a total of {len(models_attentions_weights)} benchmarks, " - f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " - f"{len(ATTENTION_CONFIGS)} attentions implementations " - f"and {len(WEIGHTS_CONFIGS)} weights configurations." 
- ) - - for model, attn_implementation, weights_config in models_attentions_weights: - benchmark_cpu_pytorch(model, attn_implementation, weights_config) diff --git a/llm_perf/update_llm_perf_cuda_pytorch.py b/llm_perf/update_llm_perf_cuda_pytorch.py deleted file mode 100644 index 98914f6ad..000000000 --- a/llm_perf/update_llm_perf_cuda_pytorch.py +++ /dev/null @@ -1,186 +0,0 @@ -import os -import traceback -from itertools import product -from logging import getLogger - -from llm_perf.utils import ( - CANONICAL_PRETRAINED_OPEN_LLM_LIST, - GENERATE_KWARGS, - INPUT_SHAPES, - OPEN_LLM_LIST, - PRETRAINED_OPEN_LLM_LIST, - is_benchmark_conducted, -) -from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport, InferenceConfig, ProcessConfig, PyTorchConfig -from optimum_benchmark.logging_utils import setup_logging - -SUBSET = os.getenv("SUBSET", None) -MACHINE = os.getenv("MACHINE", None) - -if os.getenv("MACHINE", None) is None and os.getenv("SUBSET", None) is None: - PUSH_REPO_ID = "optimum-benchmark/llm-perf-pytorch-cuda-debug" - CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["gpt2"] # noqa: F811 - SUBSET = "unquantized" -elif os.getenv("MACHINE", None) is not None and os.getenv("SUBSET", None) is not None: - PUSH_REPO_ID = f"optimum-benchmark/llm-perf-pytorch-cuda-{SUBSET}-{MACHINE}" -else: - raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") - -ATTENTION_CONFIGS = ["eager", "sdpa", "flash_attention_2"] -if SUBSET == "unquantized": - WEIGHTS_CONFIGS = { - # unquantized - "float32": {"torch_dtype": "float32", "quant_scheme": None, "quant_config": {}}, - "float16": {"torch_dtype": "float16", "quant_scheme": None, "quant_config": {}}, - "bfloat16": {"torch_dtype": "bfloat16", "quant_scheme": None, "quant_config": {}}, - } -elif SUBSET == "bnb": - WEIGHTS_CONFIGS = { - # bnb - "4bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_4bit": True}}, - "8bit-bnb": {"torch_dtype": "float16", "quant_scheme": "bnb", "quant_config": {"load_in_8bit": True}}, - } -elif SUBSET == "gptq": - WEIGHTS_CONFIGS = { - # gptq - "4bit-gptq-exllama-v1": { - "quant_scheme": "gptq", - "torch_dtype": "float16", - "quant_config": {"bits": 4, "use_exllama ": True, "version": 1, "model_seqlen": 256}, - }, - "4bit-gptq-exllama-v2": { - "torch_dtype": "float16", - "quant_scheme": "gptq", - "quant_config": {"bits": 4, "use_exllama ": True, "version": 2, "model_seqlen": 256}, - }, - } -elif SUBSET == "awq": - WEIGHTS_CONFIGS = { - # awq - "4bit-awq-gemm": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": {"bits": 4, "version": "gemm"}, - }, - "4bit-awq-gemv": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": {"bits": 4, "version": "gemv"}, - }, - "4bit-awq-exllama-v1": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": { - "bits": 4, - "version": "exllama", - "exllama_config": {"version": 1, "max_input_len": 64, "max_batch_size": 1}, - }, - }, - "4bit-awq-exllama-v2": { - "torch_dtype": "float16", - "quant_scheme": "awq", - "quant_config": { - "bits": 4, - "version": "exllama", - "exllama_config": {"version": 2, "max_input_len": 64, "max_batch_size": 1}, - }, - }, - } - - -LOGGER = getLogger("llm-perf-backend") -LOGGER.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") -LOGGER.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") -LOGGER.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") - - -def 
is_benchmark_supported(weights_config, attn_implementation): - if attn_implementation == "flash_attention_2" and weights_config == "float32": - return False - - return True - - -def benchmark_cuda_pytorch(model, attn_implementation, weights_config): - benchmark_name = f"{weights_config}-{attn_implementation}" - subfolder = f"{benchmark_name}/{model.replace('/', '--')}" - - torch_dtype = WEIGHTS_CONFIGS[weights_config]["torch_dtype"] - quant_scheme = WEIGHTS_CONFIGS[weights_config]["quant_scheme"] - quant_config = WEIGHTS_CONFIGS[weights_config]["quant_config"] - - if not is_benchmark_supported(weights_config, attn_implementation): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") - return - - if is_benchmark_conducted(PUSH_REPO_ID, subfolder): - LOGGER.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") - return - - launcher_config = ProcessConfig(device_isolation=True, device_isolation_action="kill") - scenario_config = InferenceConfig( - memory=True, - energy=True, - latency=True, - duration=10, - iterations=10, - warmup_runs=10, - input_shapes=INPUT_SHAPES, - generate_kwargs=GENERATE_KWARGS, - ) - backend_config = PyTorchConfig( - model=model, - device="cuda", - device_ids="0", - no_weights=True, - library="transformers", - task="text-generation", - torch_dtype=torch_dtype, - quantization_scheme=quant_scheme, - quantization_config=quant_config, - attn_implementation=attn_implementation, - model_kwargs={"trust_remote_code": True}, - ) - - benchmark_config = BenchmarkConfig( - name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config - ) - - benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - try: - LOGGER.info(f"Running benchmark {benchmark_name} with model {model}") - benchmark_report = Benchmark.launch(benchmark_config) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - except Exception: - LOGGER.error(f"Benchmark {benchmark_name} failed with model {model}") - benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) - benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=subfolder, private=True) - - -if __name__ == "__main__": - # for isolated process - os.environ["LOG_TO_FILE"] = "0" - os.environ["LOG_LEVEL"] = "INFO" - - # for main process - setup_logging(level="INFO", prefix="MAIN-PROCESS") - - models_attentions_weights = list( - product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, ATTENTION_CONFIGS, WEIGHTS_CONFIGS.keys()) - ) - - LOGGER.info( - f"Running a total of {len(models_attentions_weights)} benchmarks, " - f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " - f"{len(ATTENTION_CONFIGS)} attentions implementations " - f"and {len(WEIGHTS_CONFIGS)} weights configurations." 
- ) - - for model, attn_implementation, weights_config in models_attentions_weights: - benchmark_cuda_pytorch(model, attn_implementation, weights_config) From 25128277e1c6a2cdbfa82d2b0bd0efd1f5b3cc72 Mon Sep 17 00:00:00 2001 From: baptiste Date: Mon, 23 Sep 2024 07:08:25 +0000 Subject: [PATCH 70/73] add new label system --- .../update_llm_perf_cpu_pytorch.yaml | 16 ++++++++++++++++ .../update_llm_perf_cuda_pytorch.yaml | 5 +++++ .../update_llm_perf_leaderboard.yaml | 19 ++++++++++++++++++- 3 files changed, 39 insertions(+), 1 deletion(-) diff --git a/.github/workflows/update_llm_perf_cpu_pytorch.yaml b/.github/workflows/update_llm_perf_cpu_pytorch.yaml index 9e002d70b..e24a1aa69 100644 --- a/.github/workflows/update_llm_perf_cpu_pytorch.yaml +++ b/.github/workflows/update_llm_perf_cpu_pytorch.yaml @@ -4,6 +4,18 @@ on: workflow_dispatch: schedule: - cron: "0 0 * * *" + push: + branches: + - main + pull_request: + branches: + - main + types: + - opened + - reopened + - synchronize + - labeled + - unlabeled concurrency: cancel-in-progress: true @@ -14,6 +26,10 @@ env: jobs: run_benchmarks: + if: ${{ + (github.event_name == 'push') || + (github.event_name == 'workflow_dispatch') || + contains( github.event.pull_request.labels.*.name, 'leaderboard')}} strategy: fail-fast: false matrix: diff --git a/.github/workflows/update_llm_perf_cuda_pytorch.yaml b/.github/workflows/update_llm_perf_cuda_pytorch.yaml index ab36c3b9c..9a60ab931 100644 --- a/.github/workflows/update_llm_perf_cuda_pytorch.yaml +++ b/.github/workflows/update_llm_perf_cuda_pytorch.yaml @@ -14,6 +14,11 @@ env: jobs: run_benchmarks: + if: ${{ + (github.event_name == 'push') || + (github.event_name == 'workflow_dispatch') || + contains( github.event.pull_request.labels.*.name, 'leaderboard')}} + strategy: fail-fast: false matrix: diff --git a/.github/workflows/update_llm_perf_leaderboard.yaml b/.github/workflows/update_llm_perf_leaderboard.yaml index 10ed80c98..f0a6b7a43 100644 --- a/.github/workflows/update_llm_perf_leaderboard.yaml +++ b/.github/workflows/update_llm_perf_leaderboard.yaml @@ -4,13 +4,30 @@ on: workflow_dispatch: schedule: - cron: "0 */6 * * *" + push: + branches: + - main + pull_request: + branches: + - main + types: + - opened + - reopened + - synchronize + - labeled + - unlabeled concurrency: cancel-in-progress: true - group: ${{ github.workflow }}-${{ github.ref }} + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} jobs: update_llm_perf_leaderboard: + if: ${{ + (github.event_name == 'push') || + (github.event_name == 'workflow_dispatch') || + contains( github.event.pull_request.labels.*.name, 'leaderboard')}} + runs-on: ubuntu-latest steps: - name: Checkout From defc78abaa3819db117a3d256922db62260a4eda Mon Sep 17 00:00:00 2001 From: baptiste Date: Mon, 23 Sep 2024 07:09:51 +0000 Subject: [PATCH 71/73] add new label system --- .github/workflows/update_llm_perf_cpu_pytorch.yaml | 2 +- .../workflows/update_llm_perf_cuda_pytorch.yaml | 14 +++++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/.github/workflows/update_llm_perf_cpu_pytorch.yaml b/.github/workflows/update_llm_perf_cpu_pytorch.yaml index e24a1aa69..eadacc26a 100644 --- a/.github/workflows/update_llm_perf_cpu_pytorch.yaml +++ b/.github/workflows/update_llm_perf_cpu_pytorch.yaml @@ -19,7 +19,7 @@ on: concurrency: cancel-in-progress: true - group: ${{ github.workflow }}-${{ github.ref }} + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} env: IMAGE: 
ghcr.io/huggingface/optimum-benchmark:latest-cpu diff --git a/.github/workflows/update_llm_perf_cuda_pytorch.yaml b/.github/workflows/update_llm_perf_cuda_pytorch.yaml index 9a60ab931..01aed81e7 100644 --- a/.github/workflows/update_llm_perf_cuda_pytorch.yaml +++ b/.github/workflows/update_llm_perf_cuda_pytorch.yaml @@ -4,10 +4,22 @@ on: workflow_dispatch: schedule: - cron: "0 0 * * *" + push: + branches: + - main + pull_request: + branches: + - main + types: + - opened + - reopened + - synchronize + - labeled + - unlabeled concurrency: cancel-in-progress: true - group: ${{ github.workflow }}-${{ github.ref }} + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} env: IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-cuda From 89b6a97a171738c6d66996e253e7990946dfb3ba Mon Sep 17 00:00:00 2001 From: baptiste Date: Mon, 23 Sep 2024 09:34:59 +0000 Subject: [PATCH 72/73] add new chnages from review --- .../update_llm_perf_cpu_pytorch.py | 32 +++++++++-- .../update_llm_perf_cuda_pytorch.py | 35 +++++++++--- llm_perf/common/benchmark_runner.py | 57 ++++++++++--------- 3 files changed, 85 insertions(+), 39 deletions(-) diff --git a/llm_perf/benchmark_runners/update_llm_perf_cpu_pytorch.py b/llm_perf/benchmark_runners/update_llm_perf_cpu_pytorch.py index 62a52a67b..c6ce76290 100644 --- a/llm_perf/benchmark_runners/update_llm_perf_cpu_pytorch.py +++ b/llm_perf/benchmark_runners/update_llm_perf_cpu_pytorch.py @@ -1,18 +1,39 @@ +from itertools import product from typing import Any, Dict, List -from llm_perf.common.benchmark_runner import BenchmarkRunner -from llm_perf.common.utils import GENERATE_KWARGS, INPUT_SHAPES +from llm_perf.common.benchmark_runner import LLMPerfBenchmarkManager +from llm_perf.common.utils import CANONICAL_PRETRAINED_OPEN_LLM_LIST, GENERATE_KWARGS, INPUT_SHAPES from optimum_benchmark import PyTorchConfig from optimum_benchmark.benchmark.config import BenchmarkConfig from optimum_benchmark.launchers.process.config import ProcessConfig from optimum_benchmark.scenarios.inference.config import InferenceConfig -class CPUPyTorchBenchmarkRunner(BenchmarkRunner): +class CPUPyTorchBenchmarkRunner(LLMPerfBenchmarkManager): def __init__(self): - super().__init__(backend="pytorch", hardware="cpu") + super().__init__(backend="pytorch", device="cpu") + + self.attention_configs = self._get_attention_configs() + assert self.subset is not None, "SUBSET environment variable must be set for benchmarking" + self.weights_configs = self._get_weights_configs(self.subset) + + def get_list_of_benchmarks_to_run(self) -> List[Dict[str, Any]]: + return [ + {"model": model, "attn_implementation": attn_impl, "weights_config": weights_cfg} + for model, attn_impl, weights_cfg in product( + CANONICAL_PRETRAINED_OPEN_LLM_LIST, self.attention_configs, self.weights_configs.keys() + ) + ] + + def get_benchmark_name(self, model: str, **kwargs) -> str: + weights_config = kwargs["weights_config"] + attn_implementation = kwargs["attn_implementation"] + return f"{model}-{weights_config}-{attn_implementation}" + + def get_benchmark_config(self, model: str, **kwargs) -> BenchmarkConfig: + weights_config = kwargs["weights_config"] + attn_implementation = kwargs["attn_implementation"] - def get_benchmark_config(self, model: str, attn_implementation: str, weights_config: str) -> BenchmarkConfig: assert ( weights_config in self.weights_configs ), f"your config does not contain {weights_config}, adjust your _get_weights_configs to fix this issue" @@ -65,7 +86,6 @@ def 
_get_weights_configs(self, subset) -> Dict[str, Dict[str, Any]]: def _get_attention_configs(self) -> List[str]: return ["eager", "sdpa"] - if __name__ == "__main__": runner = CPUPyTorchBenchmarkRunner() runner.run_benchmarks() diff --git a/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py b/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py index f22b28eb0..82aab3db9 100644 --- a/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py +++ b/llm_perf/benchmark_runners/update_llm_perf_cuda_pytorch.py @@ -1,23 +1,44 @@ +from itertools import product from typing import Any, Dict, List -from llm_perf.common.benchmark_runner import BenchmarkRunner -from llm_perf.common.utils import GENERATE_KWARGS, INPUT_SHAPES +from llm_perf.common.benchmark_runner import LLMPerfBenchmarkManager +from llm_perf.common.utils import CANONICAL_PRETRAINED_OPEN_LLM_LIST, GENERATE_KWARGS, INPUT_SHAPES from optimum_benchmark import PyTorchConfig from optimum_benchmark.benchmark.config import BenchmarkConfig from optimum_benchmark.launchers.process.config import ProcessConfig from optimum_benchmark.scenarios.inference.config import InferenceConfig -class CUDAPyTorchBenchmarkRunner(BenchmarkRunner): +class CUDAPyTorchBenchmarkRunner(LLMPerfBenchmarkManager): def __init__(self): - super().__init__(backend="pytorch", hardware="cuda") + super().__init__(backend="pytorch", device="cuda") - def is_benchmark_supported(self, weights_config: str, attn_implementation: str) -> bool: - if attn_implementation == "flash_attention_2" and weights_config == "float32": + self.attention_configs = self._get_attention_configs() + assert self.subset is not None, "SUBSET environment variable must be set for benchmarking" + self.weights_configs = self._get_weights_configs(self.subset) + + def get_list_of_benchmarks_to_run(self) -> List[Dict[str, Any]]: + return [ + {"model": model, "attn_implementation": attn_impl, "weights_config": weights_cfg} + for model, attn_impl, weights_cfg in product( + CANONICAL_PRETRAINED_OPEN_LLM_LIST, self.attention_configs, self.weights_configs.keys() + ) + ] + + def get_benchmark_name(self, model: str, **kwargs) -> str: + weights_config = kwargs["weights_config"] + attn_implementation = kwargs["attn_implementation"] + return f"{model}-{weights_config}-{attn_implementation}" + + def is_benchmark_supported(self, **kwargs) -> bool: + if kwargs["attn_implementation"] == "flash_attention_2" and kwargs["weights_config"] == "float32": return False return True - def get_benchmark_config(self, model: str, attn_implementation: str, weights_config: str) -> BenchmarkConfig: + def get_benchmark_config(self, model: str, **kwargs) -> BenchmarkConfig: + weights_config = kwargs["weights_config"] + attn_implementation = kwargs["attn_implementation"] + assert ( weights_config in self.weights_configs ), f"your config does contains the {weights_config}, adjust your _get_weights_configs to fix this issue" diff --git a/llm_perf/common/benchmark_runner.py b/llm_perf/common/benchmark_runner.py index 40673e126..a889ade4b 100644 --- a/llm_perf/common/benchmark_runner.py +++ b/llm_perf/common/benchmark_runner.py @@ -1,7 +1,6 @@ import os import traceback from abc import ABC, abstractmethod -from itertools import product from logging import getLogger from typing import Any, Dict, List, Optional @@ -14,28 +13,23 @@ from optimum_benchmark.logging_utils import setup_logging -class BenchmarkRunner(ABC): - def __init__(self, backend: str, hardware: str, subset: Optional[str] = None, machine: Optional[str] = None): +class 
LLMPerfBenchmarkManager(ABC): + def __init__(self, backend: str, device: str, subset: Optional[str] = None, machine: Optional[str] = None): self.backend = backend - self.hardware = hardware + self.device = device self.subset = subset or os.getenv("SUBSET", None) self.machine = machine or os.getenv("MACHINE", None) self.logger = getLogger("llm-perf-backend") if self.machine is None and self.subset is None: - self.push_repo_id = f"optimum-benchmark/llm-perf-{self.backend}-{self.hardware}-debug" + self.push_repo_id = f"optimum-benchmark/llm-perf-{self.backend}-{self.device}-debug" self.canonical_pretrained_open_llm_list = ["gpt2"] self.subset = "unquantized" elif self.machine is not None and self.subset is not None: - self.push_repo_id = ( - f"optimum-benchmark/llm-perf-{self.backend}-{self.hardware}-{self.subset}-{self.machine}" - ) + self.push_repo_id = f"optimum-benchmark/llm-perf-{self.backend}-{self.device}-{self.subset}-{self.machine}" else: raise ValueError("Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging") - self.attention_configs = self._get_attention_configs() - self.weights_configs = self._get_weights_configs(self.subset) - self.logger.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") self.logger.info(f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}") self.logger.info(f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}") @@ -48,27 +42,32 @@ def _get_weights_configs(self, subset: str) -> Dict[str, Dict[str, Any]]: def _get_attention_configs(self) -> List[str]: raise NotImplementedError("This method should be implemented in the child class") - def is_benchmark_supported(self, weights_config: str, attn_implementation: str) -> bool: + def is_benchmark_supported(self, **kwargs) -> bool: + """ + Can be overridden by child classes to exclude unsupported configurations + """ return True + @abstractmethod + def get_list_of_benchmarks_to_run(self) -> List[Dict[str, Any]]: + raise NotImplementedError("This method should be implemented in the child class") + def run_benchmarks(self): os.environ["LOG_TO_FILE"] = "0" os.environ["LOG_LEVEL"] = "INFO" setup_logging(level="INFO", prefix="MAIN-PROCESS") - models_attentions_weights = list( - product(CANONICAL_PRETRAINED_OPEN_LLM_LIST, self.attention_configs, self.weights_configs.keys()) - ) + benchmarks_to_run = self.get_list_of_benchmarks_to_run() self.logger.info( - f"Running a total of {len(models_attentions_weights)} benchmarks, " - f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models, " - f"{len(self.attention_configs)} attentions implementations " - f"and {len(self.weights_configs)} weights configurations." 
+ f"Running a total of {len(benchmarks_to_run)} benchmarks, " + f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models" ) - for model, attn_implementation, weights_config in models_attentions_weights: - self.run_benchmark(model, attn_implementation, weights_config) + for benchmark_name in benchmarks_to_run: + assert "model" in benchmark_name, "each benchmark should have a model" + + self.run_benchmark(**benchmark_name) def is_benchmark_conducted(self, push_repo_id, subfolder): try: @@ -80,11 +79,17 @@ def is_benchmark_conducted(self, push_repo_id, subfolder): except Exception: return False - def run_benchmark(self, model: str, attn_implementation: str, weights_config: str): - benchmark_name = f"{weights_config}-{attn_implementation}" + @abstractmethod + def get_benchmark_name(self, model: str, **kwargs) -> str: + raise NotImplementedError("This method should be implemented in the child class") + + def run_benchmark(self, **kwargs): + model = kwargs["model"] + + benchmark_name = self.get_benchmark_name(model, **kwargs) subfolder = f"{benchmark_name}/{model.replace('/', '--')}" - if not self.is_benchmark_supported(weights_config, attn_implementation): + if not self.is_benchmark_supported(**kwargs): self.logger.info(f"Skipping benchmark {benchmark_name} with model {model} since it is not supported") return @@ -92,12 +97,12 @@ def run_benchmark(self, model: str, attn_implementation: str, weights_config: st self.logger.info(f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted") return - benchmark_config = self.get_benchmark_config(model, attn_implementation, weights_config) + benchmark_config = self.get_benchmark_config(model, **kwargs) benchmark_config.push_to_hub(repo_id=self.push_repo_id, subfolder=subfolder, private=True) self.execute_and_log_benchmark(benchmark_config, subfolder) @abstractmethod - def get_benchmark_config(self, model: str, attn_implementation: str, weights_config: str) -> BenchmarkConfig: + def get_benchmark_config(self, model: str, **kwargs) -> BenchmarkConfig: raise NotImplementedError("This method should be implemented in the child class") def execute_and_log_benchmark(self, benchmark_config: BenchmarkConfig, subfolder: str): From 3130c87c8f664ee5109fb8a6d327d643e2fe62d8 Mon Sep 17 00:00:00 2001 From: baptiste Date: Mon, 23 Sep 2024 09:41:45 +0000 Subject: [PATCH 73/73] add new chnages from review --- llm_perf/benchmark_runners/update_llm_perf_cpu_pytorch.py | 1 + llm_perf/common/benchmark_runner.py | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/llm_perf/benchmark_runners/update_llm_perf_cpu_pytorch.py b/llm_perf/benchmark_runners/update_llm_perf_cpu_pytorch.py index c6ce76290..c27f5e220 100644 --- a/llm_perf/benchmark_runners/update_llm_perf_cpu_pytorch.py +++ b/llm_perf/benchmark_runners/update_llm_perf_cpu_pytorch.py @@ -86,6 +86,7 @@ def _get_weights_configs(self, subset) -> Dict[str, Dict[str, Any]]: def _get_attention_configs(self) -> List[str]: return ["eager", "sdpa"] + if __name__ == "__main__": runner = CPUPyTorchBenchmarkRunner() runner.run_benchmarks() diff --git a/llm_perf/common/benchmark_runner.py b/llm_perf/common/benchmark_runner.py index a889ade4b..def30dc20 100644 --- a/llm_perf/common/benchmark_runner.py +++ b/llm_perf/common/benchmark_runner.py @@ -84,7 +84,7 @@ def get_benchmark_name(self, model: str, **kwargs) -> str: raise NotImplementedError("This method should be implemented in the child class") def run_benchmark(self, **kwargs): - model = kwargs["model"] + model = 
kwargs.pop("model") benchmark_name = self.get_benchmark_name(model, **kwargs) subfolder = f"{benchmark_name}/{model.replace('/', '--')}" @@ -112,8 +112,10 @@ def execute_and_log_benchmark(self, benchmark_config: BenchmarkConfig, subfolder benchmark_report.push_to_hub(repo_id=self.push_repo_id, subfolder=subfolder, private=True) benchmark = Benchmark(config=benchmark_config, report=benchmark_report) benchmark.push_to_hub(repo_id=self.push_repo_id, subfolder=subfolder, private=True) - except Exception: - self.logger.error(f"Benchmark {benchmark_config.name} failed with model {benchmark_config.backend.model}") + except Exception as e: + self.logger.error( + f"Benchmark {benchmark_config.name} failed with model {benchmark_config.backend.model}, error:\n{e}" + ) benchmark_report = BenchmarkReport.from_dict({"traceback": traceback.format_exc()}) benchmark_report.push_to_hub(repo_id=self.push_repo_id, subfolder=subfolder, private=True) benchmark = Benchmark(config=benchmark_config, report=benchmark_report)