Skip to content

Commit

Permalink
Add --gpu-lockdown option to pin GPU frequency and add power limit
Browse files Browse the repository at this point in the history
Summary: Use bertmaher's script P1491504558 and a new option `--gpu-lockdown` to lock down the GPU frequency and power limit.

Reviewed By: int3

Differential Revision: D60301004

fbshipit-source-id: ff2e303c9d6570c0938919deed9c870c2a61b6b8
  • Loading branch information
xuzhao9 authored and facebook-github-bot committed Jul 30, 2024
1 parent 89cdb8a commit 2698ff8
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 3 deletions.
57 changes: 57 additions & 0 deletions userbenchmark/triton/gpu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from contextlib import contextmanager
import os
import subprocess
import logging

# GPU selector read once at import time from the environment; defaults to "0"
# when CUDA_VISIBLE_DEVICES is unset. The raw string is forwarded to
# `nvidia-smi -i`, and its first comma-separated entry is used for NVML lookup.
CUDA_VISIBLE_DEVICES = os.getenv("CUDA_VISIBLE_DEVICES", "0")

# Power cap applied per supported GPU model, keyed by the device name reported
# by NVML. Values stay strings because they are placed directly on the
# `nvidia-smi --power-limit` command line.
# NOTE(review): presumably expressed in watts -- confirm against nvidia-smi docs.
POWER_LIMIT = {
    "NVIDIA PG509-210": "330",
    "NVIDIA A100": "330",
    "NVIDIA H100": "650",
}
# Clock value each supported GPU model is pinned to, keyed by the device name
# reported by NVML. Values stay strings because they are placed directly on
# the `nvidia-smi -lgc` command line.
# NOTE(review): presumably expressed in MHz -- confirm against nvidia-smi docs.
FREQ_LIMIT = {
    "NVIDIA PG509-210": "1410",
    "NVIDIA A100": "1410",
    "NVIDIA H100": "1980",
}

def _set_pm():
    """Run ``sudo nvidia-smi -i <devices> -pm 1`` for the selected GPU(s).

    ``-pm`` is nvidia-smi's persistence-mode switch; ``1`` turns it on.

    Raises:
        subprocess.CalledProcessError: if the command exits non-zero.
    """
    subprocess.check_call(
        ["sudo", "nvidia-smi", "-i", CUDA_VISIBLE_DEVICES, "-pm", "1"]
    )

def _set_power(gpu_info: str):
    """Apply the power limit configured for ``gpu_info`` to the selected GPU(s).

    Args:
        gpu_info: GPU model name; must be a key of ``POWER_LIMIT``.

    Raises:
        KeyError: if ``gpu_info`` has no entry in ``POWER_LIMIT``.
        subprocess.CalledProcessError: if ``nvidia-smi`` exits non-zero.
    """
    nvidia_smi_args = [
        "sudo",
        "nvidia-smi",
        "-i",
        CUDA_VISIBLE_DEVICES,
        "--power-limit",
        POWER_LIMIT[gpu_info],
    ]
    subprocess.check_call(nvidia_smi_args)

def _set_clock(gpu_info: str):
    """Lock the GPU clocks of the selected GPU(s) to the value for ``gpu_info``.

    Args:
        gpu_info: GPU model name; must be a key of ``FREQ_LIMIT``.

    Raises:
        KeyError: if ``gpu_info`` has no entry in ``FREQ_LIMIT``.
        subprocess.CalledProcessError: if ``nvidia-smi`` exits non-zero.
    """
    nvidia_smi_args = [
        "sudo",
        "nvidia-smi",
        "-i",
        CUDA_VISIBLE_DEVICES,
        "-lgc",  # lgc: lock gpu clocks
        FREQ_LIMIT[gpu_info],
    ]
    subprocess.check_call(nvidia_smi_args)

def _reset_clock(gpu_info: str):
    """Undo a previous clock lock on the selected GPU(s).

    Args:
        gpu_info: GPU model name. Not used by the command; accepted so the
            signature mirrors ``_set_clock``.

    Raises:
        subprocess.CalledProcessError: if ``nvidia-smi`` exits non-zero.
    """
    subprocess.check_call(
        # rgc: reset gpu clocks
        ["sudo", "nvidia-smi", "-i", CUDA_VISIBLE_DEVICES, "-rgc"]
    )

def _get_gpu_name() -> str:
    """Return the NVML device name of the first GPU in ``CUDA_VISIBLE_DEVICES``.

    Only the first comma-separated entry is inspected, and it must be an
    integer device index.

    Returns:
        The device name as ``str``.

    Raises:
        ImportError: if ``pynvml`` is not installed.
        ValueError: if the first entry is not an integer index.
        pynvml.NVMLError: if NVML cannot be initialised or the index is invalid.
    """
    # Imported lazily so the module can be loaded on hosts without pynvml
    # when the lockdown feature is not used.
    import pynvml

    pynvml.nvmlInit()
    try:
        gpu_id = CUDA_VISIBLE_DEVICES.split(",")[0]
        handle = pynvml.nvmlDeviceGetHandleByIndex(int(gpu_id))
        name = pynvml.nvmlDeviceGetName(handle)
    finally:
        # Balance nvmlInit() so the NVML session is not leaked.
        pynvml.nvmlShutdown()
    # Older pynvml releases return bytes, newer ones return str; calling
    # .decode() unconditionally raises AttributeError on the latter.
    if isinstance(name, bytes):
        return name.decode("utf-8")
    return name

@contextmanager
def gpu_lockdown(enabled=True):
    """Context manager that pins GPU clocks and power limit around a block.

    When ``enabled`` is true, it looks up the GPU model and applies the
    persistence-mode, power-limit and clock-lock ``nvidia-smi`` commands
    before the body runs. On exit, including on error, it resets the GPU
    clocks. The power limit and persistence mode are not restored on exit.

    When ``enabled`` is false, it does nothing and just runs the body.

    Args:
        enabled: whether to perform the lockdown.

    Raises:
        AssertionError: if the detected GPU has no entry in ``POWER_LIMIT``.
        subprocess.CalledProcessError: if any ``nvidia-smi`` command fails.
    """
    if not enabled:
        yield
        return

    logging.info(f"[tritonbench] Locking down GPU {CUDA_VISIBLE_DEVICES}")
    # Resolve and validate the GPU *before* entering the try/finally: if this
    # step fails, nothing has been locked yet, and the cleanup must not run
    # with an unbound ``gpu_name`` (which used to mask the real error with a
    # NameError).
    gpu_name = _get_gpu_name()
    if gpu_name not in POWER_LIMIT:
        # Raised explicitly rather than via ``assert`` so the check survives
        # ``python -O``; the exception type is unchanged for existing callers.
        raise AssertionError(f"Unsupported GPU {gpu_name}")

    try:
        _set_pm()
        _set_power(gpu_name)
        _set_clock(gpu_name)
        yield
    finally:
        # Always release the clock lock, even if setup or the body failed.
        _reset_clock(gpu_name)
13 changes: 10 additions & 3 deletions userbenchmark/triton/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
import os
import sys
import tempfile
import torch
from typing import List

from torch import version as torch_version
from torchbenchmark.operators import load_opbench_by_name

Expand All @@ -14,13 +14,14 @@
)

# Start with a no-op logger. It is replaced only when torch imports, its
# `torch.version` has no `git_version` attribute, and the helper below can be
# imported; any ImportError along the way leaves the no-op in place.
usage_report_logger = lambda *args, **kwargs: None
try:
    import torch

    if not hasattr(torch.version, "git_version"):
        from pytorch.benchmark.fb.run_utils import usage_report_logger
except ImportError:
    pass

from .gpu import gpu_lockdown

# Directory (with trailing slash) under the system temp dir for CSV dumps.
TRITON_BENCH_CSV_DUMP_PATH = f"{tempfile.gettempdir()}/tritonbench/"

Expand Down Expand Up @@ -132,6 +133,11 @@ def get_parser():
action="store_true",
help="Dump Triton IR",
)
parser.add_argument(
"--gpu-lockdown",
action="store_true",
help="Lock down GPU frequency and clocks to avoid throttling."
)
if not hasattr(torch_version, "git_version"):
parser.add_argument("--log-scuba", action="store_true", help="Log to scuba.")
return parser
Expand Down Expand Up @@ -181,4 +187,5 @@ def run(args: List[str] = []):
from .ci import run_ci
run_ci()
return
_run(args, extra_args)
with gpu_lockdown(args.gpu_lockdown):
_run(args, extra_args)

0 comments on commit 2698ff8

Please sign in to comment.