From ac7a55df4dfac8ba3fa1cf17b0ea74300d7f9980 Mon Sep 17 00:00:00 2001 From: "Jason T. Greene" Date: Mon, 12 Aug 2024 04:38:40 +0000 Subject: [PATCH] fix: resolve fork failures during training runs torchrun jobs create a number of children per GPU which can often exceed the 2k limit. Signed-off-by: Jason T. Greene --- training/ilab-wrapper/ilab | 1 + training/nvidia-bootc/duplicated/ilab-wrapper/ilab | 1 + 2 files changed, 2 insertions(+) diff --git a/training/ilab-wrapper/ilab b/training/ilab-wrapper/ilab index aabd1c26..6527f839 100755 --- a/training/ilab-wrapper/ilab +++ b/training/ilab-wrapper/ilab @@ -128,6 +128,7 @@ PODMAN_COMMAND=("sudo" "--preserve-env=$PRESERVE_ENV" "podman" "run" "--rm" "-it "--device" "${CONTAINER_DEVICE}" "--security-opt" "label=disable" "--net" "host" "--shm-size" "10G" + "--pids-limit" "-1" "-v" "$HOME:$HOME" "${ADDITIONAL_MOUNT_OPTIONS[@]}" # This is intentionally NOT using "--env" "HOME" because we want the HOME diff --git a/training/nvidia-bootc/duplicated/ilab-wrapper/ilab b/training/nvidia-bootc/duplicated/ilab-wrapper/ilab index aabd1c26..6527f839 100755 --- a/training/nvidia-bootc/duplicated/ilab-wrapper/ilab +++ b/training/nvidia-bootc/duplicated/ilab-wrapper/ilab @@ -128,6 +128,7 @@ PODMAN_COMMAND=("sudo" "--preserve-env=$PRESERVE_ENV" "podman" "run" "--rm" "-it "--device" "${CONTAINER_DEVICE}" "--security-opt" "label=disable" "--net" "host" "--shm-size" "10G" + "--pids-limit" "-1" "-v" "$HOME:$HOME" "${ADDITIONAL_MOUNT_OPTIONS[@]}" # This is intentionally NOT using "--env" "HOME" because we want the HOME