-
Notifications
You must be signed in to change notification settings - Fork 278
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add
--gpu-lockdown
option to pin GPU frequency and add power limit
Summary: Use bertmaher's script P1491504558 and add a `--gpu-lockdown` option to lock down the GPU frequency and power limit. Reviewed By: int3 Differential Revision: D60301004 fbshipit-source-id: ff2e303c9d6570c0938919deed9c870c2a61b6b8
- Loading branch information
1 parent
89cdb8a
commit 2698ff8
Showing
2 changed files
with
67 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
from contextlib import contextmanager | ||
import os | ||
import subprocess | ||
import logging | ||
|
||
# GPU selector read once at import time. It is forwarded verbatim to
# `nvidia-smi -i`, and its first comma-separated entry is used as the NVML
# device index. Falls back to "0" when the environment variable is unset.
CUDA_VISIBLE_DEVICES = os.getenv("CUDA_VISIBLE_DEVICES", "0")
|
||
# One row per supported GPU: (product name, power limit, locked clock).
# The values are passed as-is to `nvidia-smi --power-limit` and
# `nvidia-smi -lgc` respectively (presumably watts and MHz; confirm against
# the nvidia-smi manual before adding new entries).
_GPU_LIMITS = (
    ("NVIDIA PG509-210", "330", "1410"),
    ("NVIDIA A100", "330", "1410"),
    ("NVIDIA H100", "650", "1980"),
)

# GPU product name -> power limit argument.
POWER_LIMIT = {name: power for name, power, _clock in _GPU_LIMITS}

# GPU product name -> locked GPU clock argument.
FREQ_LIMIT = {name: clock for name, _power, clock in _GPU_LIMITS}
|
||
def _set_pm():
    """Run ``sudo nvidia-smi -i <devices> -pm 1`` on the selected GPUs.

    ``-pm 1`` is nvidia-smi's switch for turning persistence mode on.

    Raises:
        subprocess.CalledProcessError: if nvidia-smi exits with a non-zero
            status.
    """
    nvidia_smi = ["sudo", "nvidia-smi", "-i", CUDA_VISIBLE_DEVICES]
    subprocess.check_call([*nvidia_smi, "-pm", "1"])
|
||
def _set_power(gpu_info: str):
    """Apply the power limit configured for ``gpu_info`` via nvidia-smi.

    Args:
        gpu_info: GPU product name; must be a key of ``POWER_LIMIT``.

    Raises:
        KeyError: if ``gpu_info`` has no entry in ``POWER_LIMIT``.
        subprocess.CalledProcessError: if nvidia-smi exits with a non-zero
            status.
    """
    limit = POWER_LIMIT[gpu_info]
    subprocess.check_call(
        ["sudo", "nvidia-smi", "-i", CUDA_VISIBLE_DEVICES, "--power-limit", limit]
    )
|
||
def _set_clock(gpu_info: str):
    """Lock the GPU clock to the frequency configured for ``gpu_info``.

    Args:
        gpu_info: GPU product name; must be a key of ``FREQ_LIMIT``.

    Raises:
        KeyError: if ``gpu_info`` has no entry in ``FREQ_LIMIT``.
        subprocess.CalledProcessError: if nvidia-smi exits with a non-zero
            status.
    """
    frequency = FREQ_LIMIT[gpu_info]
    # `-lgc` is nvidia-smi's "lock gpu clocks" switch.
    subprocess.check_call(
        ["sudo", "nvidia-smi", "-i", CUDA_VISIBLE_DEVICES, "-lgc", frequency]
    )
|
||
def _reset_clock(gpu_info: str):
    """Undo the clock lock on the selected GPUs.

    Args:
        gpu_info: GPU product name. Not used by the command; accepted so the
            signature mirrors ``_set_clock``.

    Raises:
        subprocess.CalledProcessError: if nvidia-smi exits with a non-zero
            status.
    """
    # `-rgc` is nvidia-smi's "reset gpu clocks" switch.
    subprocess.check_call(
        ["sudo", "nvidia-smi", "-i", CUDA_VISIBLE_DEVICES, "-rgc"]
    )
|
||
def _get_gpu_name() -> str:
    """Return the product name of the first GPU in ``CUDA_VISIBLE_DEVICES``.

    Only the first comma-separated entry is queried, and it must be an
    integer device index because it is handed to
    ``nvmlDeviceGetHandleByIndex``.

    Returns:
        The device name reported by NVML, always as ``str``.

    Raises:
        ValueError: if the first entry of ``CUDA_VISIBLE_DEVICES`` is not an
            integer (for example a GPU UUID or an empty string).
        pynvml.NVMLError: if NVML cannot be initialised or queried.
    """
    # Imported lazily so the module can be loaded on hosts without pynvml.
    import pynvml

    pynvml.nvmlInit()
    try:
        gpu_id = CUDA_VISIBLE_DEVICES.split(",")[0]
        handle = pynvml.nvmlDeviceGetHandleByIndex(int(gpu_id))
        name = pynvml.nvmlDeviceGetName(handle)
    finally:
        # NVML init/shutdown calls are reference counted, so this only
        # releases the reference taken above.
        pynvml.nvmlShutdown()

    # Older pynvml releases return bytes, newer ones return str; decoding
    # unconditionally raises AttributeError on the latter.
    if isinstance(name, bytes):
        return name.decode("utf-8")
    return name
|
||
@contextmanager
def gpu_lockdown(enabled: bool = True):
    """Context manager that pins GPU clocks and power for the enclosed code.

    When ``enabled`` is true, the GPU selected by ``CUDA_VISIBLE_DEVICES`` is
    identified, then persistence mode, the power limit and the locked clock
    are applied through ``sudo nvidia-smi`` before the body runs. On exit,
    whether normal or exceptional, the clock lock is reset. The power limit
    and persistence mode are NOT restored.

    When ``enabled`` is false, the body runs with no GPU interaction at all.

    Args:
        enabled: Whether to apply the lockdown.

    Raises:
        AssertionError: if the detected GPU has no entry in ``POWER_LIMIT``
            or ``FREQ_LIMIT``.
        subprocess.CalledProcessError: if an nvidia-smi command fails.
    """
    if not enabled:
        yield
        return

    logging.info(f"[tritonbench] Locking down GPU {CUDA_VISIBLE_DEVICES}")

    # Resolve and validate the GPU *before* entering the try/finally, so a
    # failure here surfaces as itself. Previously the cleanup ran regardless,
    # hit an unbound `gpu_name`, and masked the real error; an unsupported GPU
    # also triggered a pointless `sudo nvidia-smi -rgc`.
    gpu_name = _get_gpu_name()
    if gpu_name not in POWER_LIMIT or gpu_name not in FREQ_LIMIT:
        # Raised explicitly (not via `assert`) so the check survives
        # `python -O`, while keeping the exception type callers may expect.
        raise AssertionError(f"Unsupported GPU {gpu_name}")

    try:
        _set_pm()
        _set_power(gpu_name)
        _set_clock(gpu_name)
        yield
    finally:
        # Always release the clock lock, even if a setup step or the body
        # raised part-way through.
        _reset_clock(gpu_name)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters