From d6b5f85953fca4a7d8a9ac0019591fa19beb3085 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Wed, 16 Oct 2024 14:02:39 +0200 Subject: [PATCH] Throwaway dbg commit --- .github/workflows/unit_test.yml | 14 +++++++------- tests/straggler/unit/_utils.py | 6 ++++++ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/.github/workflows/unit_test.yml b/.github/workflows/unit_test.yml index 9d4fb34..d9a347c 100644 --- a/.github/workflows/unit_test.yml +++ b/.github/workflows/unit_test.yml @@ -75,12 +75,12 @@ jobs: pip install pytest lightning PY_VER_NODOT=$(python -c"import sysconfig; print(sysconfig.get_config_var('py_version_nodot'))") pip install ./dist/nvidia_resiliency_ext-*-cp${PY_VER_NODOT}-*.whl - - name: Run Fault Tolerance unit tests - run: | - pytest -s -vvv -m "not gpu" ./tests/fault_tolerance/unit/ + #- name: Run Fault Tolerance unit tests + # run: | + # pytest -s -vvv -m "not gpu" ./tests/fault_tolerance/unit/ - name: Run Straggler unit tests run: | - pytest -s -vvv -m "not gpu" ./tests/straggler/unit/ - - name: Run PTL callbacks unit tests - run: | - pytest -s -vvv -m "not gpu" ./tests/ptl_resiliency/unit/ + pytest -s -vvv -m "not gpu" ./tests/straggler/unit/test_data_shared.py + #- name: Run PTL callbacks unit tests + # run: | + # pytest -s -vvv -m "not gpu" ./tests/ptl_resiliency/unit/ diff --git a/tests/straggler/unit/_utils.py b/tests/straggler/unit/_utils.py index bba1613..516b104 100644 --- a/tests/straggler/unit/_utils.py +++ b/tests/straggler/unit/_utils.py @@ -111,10 +111,16 @@ def distributed_worker( ready_flag.set() + print(f"RANK {rank} STARTS WORKER FN", flush=True, file=sys.stderr) + worker_fn(**kwargs) + print(f"RANK {rank} BEFORE destroy_process_group", flush=True, file=sys.stderr) + torch.distributed.destroy_process_group() + print(f"RANK {rank} AFTER destroy_process_group", flush=True, file=sys.stderr) + sys.exit(0)