Skip to content

Commit

Permalink
Make NCU runs more likely to succeed
Browse files: browse the repository at this point in the history
Reviewed By: chenyang78

Differential Revision: D58972699

fbshipit-source-id: 0c2c77bcb1f3d7e877d84498ad55ee0de5200471
  • Loading branch information
int3 authored and facebook-github-bot committed Jun 25, 2024
1 parent 6de6dd2 commit 4cda064
Showing 1 changed file with 19 additions and 12 deletions.
31 changes: 19 additions & 12 deletions torchbenchmark/util/triton_op.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -914,18 +914,23 @@ def ncu_trace(self, input_id: int, fn_name: str, replay: bool=False) -> str:
]
)
# Disable DCGM
- try:
- disable_dcgm = [
- "sudo",
- "dyno",
- "dcgm_profiling",
- "--mute=true",
- "--duration=100000_s",
- ]
- subprocess.run(disable_dcgm, check=True)
- except subprocess.SubprocessError:
+ disable_dyno_dcgm = [
+ "sudo",
+ "dyno",
+ "dcgm_profiling",
+ "--mute=true",
+ "--duration=100000_s",
+ ]
+ disable_dcgm_service = [
+ "sudo",
+ "systemctl",
+ "stop",
+ "nvidia-dcgm",
+ ]
+ if subprocess.run(disable_dyno_dcgm).returncode != 0 and \
+ subprocess.run(disable_dcgm_service).returncode != 0:
warnings.warn(
- "Cannot find dyno to disable DCGM. Proceed to collect NCU Trace."
+ "DCGM may not have been successfully disabled. Proceeding to collect NCU trace anyway..."
)
ncu_output_dir = self.get_temp_path("ncu_traces/{fn_name}_{input_id}")
ncu_output_dir.mkdir(parents=True, exist_ok=True)
Expand Down Expand Up @@ -958,7 +963,9 @@ def ncu_trace(self, input_id: int, fn_name: str, replay: bool=False) -> str:
])
ncu_args.extend(op_task_args)
logger.info("Running NCU: %s", shlex.join(ncu_args))
- subprocess.check_call(ncu_args)
+ # Sometimes, `ncu --target-processes all` will fail with the message "Failed to connect to process". Setting
+ # CUDA_INJECTION64_PATH=none seems to fix this issue.
+ subprocess.check_call(ncu_args, env={**os.environ, "CUDA_INJECTION64_PATH": "none"})
return str(ncu_output_file.resolve())

def kineto_trace(self, input_id: int, fn: Callable) -> str:
Expand Down

0 comments on commit 4cda064

Please sign in to comment.