diff --git a/userbenchmark/triton/gpu.py b/userbenchmark/triton/gpu.py
new file mode 100644
index 000000000..2b6aa73da
--- /dev/null
+++ b/userbenchmark/triton/gpu.py
@@ -0,0 +1,61 @@
+from contextlib import contextmanager
+import os
+import subprocess
+import logging
+
+CUDA_VISIBLE_DEVICES = os.environ.get("CUDA_VISIBLE_DEVICES", "0")
+
+# Power limit (W) and locked SM clock (MHz) per supported GPU.
+POWER_LIMIT = {
+    "NVIDIA PG509-210": "330",
+    "NVIDIA A100": "330",
+    "NVIDIA H100": "650",
+}
+FREQ_LIMIT = {
+    "NVIDIA PG509-210": "1410",
+    "NVIDIA A100": "1410",
+    "NVIDIA H100": "1980",
+}
+
+def _set_pm():
+    # -pm 1: enable persistence mode
+    command = ["sudo", "nvidia-smi", "-i", CUDA_VISIBLE_DEVICES, "-pm", "1"]
+    subprocess.check_call(command)
+
+def _set_power(gpu_info: str):
+    command = ["sudo", "nvidia-smi", "-i", CUDA_VISIBLE_DEVICES, "--power-limit", POWER_LIMIT[gpu_info]]
+    subprocess.check_call(command)
+
+def _set_clock(gpu_info: str):
+    # lgc: lock gpu clocks
+    command = ["sudo", "nvidia-smi", "-i", CUDA_VISIBLE_DEVICES, "-lgc", FREQ_LIMIT[gpu_info]]
+    subprocess.check_call(command)
+
+def _reset_clock():
+    # rgc: reset gpu clocks
+    command = ["sudo", "nvidia-smi", "-i", CUDA_VISIBLE_DEVICES, "-rgc"]
+    subprocess.check_call(command)
+
+def _get_gpu_name() -> str:
+    import pynvml
+    pynvml.nvmlInit()
+    gpu_id = CUDA_VISIBLE_DEVICES.split(",")[0]
+    handle = pynvml.nvmlDeviceGetHandleByIndex(int(gpu_id))
+    return pynvml.nvmlDeviceGetName(handle).decode("utf-8")
+
+@contextmanager
+def gpu_lockdown(enabled=True):
+    gpu_name = None
+    try:
+        if enabled:
+            logging.info(f"[tritonbench] Locking down GPU {CUDA_VISIBLE_DEVICES}")
+            gpu_name = _get_gpu_name()
+            assert gpu_name in POWER_LIMIT, f"Unsupported GPU {gpu_name}"
+            _set_pm()
+            _set_power(gpu_name)
+            _set_clock(gpu_name)
+        yield
+    finally:
+        # Only reset if the clocks were actually locked above.
+        if enabled and gpu_name:
+            _reset_clock()
diff --git a/userbenchmark/triton/run.py b/userbenchmark/triton/run.py
index fb0eae74c..cf5bff17e 100644
--- a/userbenchmark/triton/run.py
+++ b/userbenchmark/triton/run.py
@@ -2,8 +2,8 @@
 import os
 import sys
 import tempfile
-import torch
 from typing import List
+
 from torch import version as torch_version
 
 from torchbenchmark.operators import load_opbench_by_name
@@ -14,13 +14,14 @@
 )
 
 try:
+    import torch
     if not hasattr(torch.version, "git_version"):
         from pytorch.benchmark.fb.run_utils import usage_report_logger
     else:
         usage_report_logger = lambda *args, **kwargs: None
 except ImportError:
     usage_report_logger = lambda *args, **kwargs: None
-
+from .gpu import gpu_lockdown
 
 TRITON_BENCH_CSV_DUMP_PATH = tempfile.gettempdir() + "/tritonbench/"
 
@@ -132,6 +133,11 @@ def get_parser():
         action="store_true",
         help="Dump Triton IR",
     )
+    parser.add_argument(
+        "--gpu-lockdown",
+        action="store_true",
+        help="Lock GPU clock frequency and power limit to avoid throttling.",
+    )
     if not hasattr(torch_version, "git_version"):
         parser.add_argument("--log-scuba", action="store_true", help="Log to scuba.")
     return parser
@@ -181,4 +187,5 @@ def run(args: List[str] = []):
         from .ci import run_ci
         run_ci()
         return
-    _run(args, extra_args)
+    with gpu_lockdown(args.gpu_lockdown):
+        _run(args, extra_args)
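For context, a minimal sketch of how the new context manager is used on its own (assumes pynvml is installed, nvidia-smi is runnable via passwordless sudo, and the GPU is one of the supported models; run_benchmarks is a hypothetical stand-in, not part of this PR):

    from userbenchmark.triton.gpu import gpu_lockdown

    def run_benchmarks():
        # Hypothetical stand-in for the actual operator benchmark loop.
        pass

    with gpu_lockdown(enabled=True):
        # Persistence mode is on and power/SM clocks are pinned here,
        # so kernel timings are not skewed by power or thermal throttling.
        run_benchmarks()
    # On exit, gpu_lockdown's finally block resets clocks via `nvidia-smi -rgc`.

From the CLI, the same lockdown wraps the entire run when the new flag is passed, e.g. `python run_benchmark.py triton --op <operator> --gpu-lockdown` (invocation path assumed from torchbench's userbenchmark convention).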