Skip to content

Commit

Permalink
Throwaway dbg commit
Browse files (browse the repository at this point in the history)
  • Loading branch information
jbieniusiewi committed Oct 16, 2024
1 parent 38c2ccb commit d6b5f85
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 7 deletions.
14 changes: 7 additions & 7 deletions .github/workflows/unit_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -75,12 +75,12 @@ jobs:
pip install pytest lightning
PY_VER_NODOT=$(python -c"import sysconfig; print(sysconfig.get_config_var('py_version_nodot'))")
pip install ./dist/nvidia_resiliency_ext-*-cp${PY_VER_NODOT}-*.whl
- - name: Run Fault Tolerance unit tests
- run: |
- pytest -s -vvv -m "not gpu" ./tests/fault_tolerance/unit/
+ #- name: Run Fault Tolerance unit tests
+ # run: |
+ # pytest -s -vvv -m "not gpu" ./tests/fault_tolerance/unit/
  - name: Run Straggler unit tests
  run: |
- pytest -s -vvv -m "not gpu" ./tests/straggler/unit/
- - name: Run PTL callbacks unit tests
- run: |
- pytest -s -vvv -m "not gpu" ./tests/ptl_resiliency/unit/
+ pytest -s -vvv -m "not gpu" ./tests/straggler/unit/test_data_shared.py
+ #- name: Run PTL callbacks unit tests
+ # run: |
+ # pytest -s -vvv -m "not gpu" ./tests/ptl_resiliency/unit/
6 changes: 6 additions & 0 deletions tests/straggler/unit/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,10 +111,16 @@ def distributed_worker(

ready_flag.set()

print(f"RANK {rank} STARTS WORKER FN", flush=True, file=sys.stderr)

worker_fn(**kwargs)

print(f"RANK {rank} BEFORE destroy_process_group", flush=True, file=sys.stderr)

torch.distributed.destroy_process_group()

print(f"RANK {rank} AFTER destroy_process_group", flush=True, file=sys.stderr)

sys.exit(0)


Expand Down

0 comments on commit d6b5f85

Please sign in to comment.