WIP fix rocm runners (#249)
Co-authored-by: IlyasMoutawwakil <moutawwakil.ilyas.tsi@gmail.com>
baptistecolle and IlyasMoutawwakil authored Aug 30, 2024
1 parent 4d3f68e commit 64616c2
Showing 6 changed files with 72 additions and 90 deletions.
44 changes: 16 additions & 28 deletions .github/workflows/test_api_rocm.yaml
@@ -25,43 +25,31 @@ concurrency:
   cancel-in-progress: true
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}

-env:
-  IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-rocm
-
 jobs:
-  build_image_and_run_api_rocm_tests:
-    runs-on: [single-gpu, amd-gpu, mi250, ci]
+  run_api_rocm_tests:
+    runs-on: [self-hosted, amd-gpu, single-gpu, mi250]

+    container:
+      image: ghcr.io/huggingface/optimum-benchmark:latest-rocm
+      options: --ipc host
+        --shm-size "16gb"
+        --group-add video
+        --device /dev/kfd
+        --device /dev/dri
+        --env ROCR_VISIBLE_DEVICES
+        --volume /mnt/cache/.cache/huggingface:/mnt/cache/
+
     steps:
       - name: Checkout
         uses: actions/checkout@v4

-      - name: Set target devices
-        run: |
-          echo "DEVICE:$DEVICE"
-          echo "DEVICE=$DEVICE" >> $GITHUB_ENV
-
-      - name: Unroot docker image
+      - name: Install dependencies
         run: |
-          docker build --build-arg IMAGE=${{ env.IMAGE }} --build-arg USER_ID=$(id -u) --build-arg GROUP_ID=$(id -g) -t ${{ env.IMAGE }}-unroot docker/unroot
+          pip install -e .[testing,timm,diffusers,codecarbon]

       - name: Run tests
-        uses: addnab/docker-run-action@v3
         env:
-          DEVICE: ${{ env.DEVICE }}
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
           PUSH_REPO_ID: optimum-benchmark/rocm
-        with:
-          image: ${{ env.IMAGE }}-unroot
-          options: |
-            --rm
-            --shm-size 64G
-            --env HF_TOKEN
-            --env PUSH_REPO_ID
-            --device /dev/kfd
-            --device /dev/dri/${{ env.DEVICE }}
-            --volume ${{ github.workspace }}:/workspace
-            --workdir /workspace
-          run: |
-            pip install -e .[testing,timm,diffusers,codecarbon]
-            pytest -s -x -k "api and cuda"
+        run: |
+          pytest -s -x -k "api and cuda"
46 changes: 17 additions & 29 deletions .github/workflows/test_cli_rocm_pytorch_multi_gpu.yaml
@@ -25,43 +25,31 @@ concurrency:
   cancel-in-progress: true
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}

-env:
-  IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-rocm
-
 jobs:
   run_cli_rocm_pytorch_multi_gpu_tests:
-    runs-on: [multi-gpu, amd-gpu, mi250, ci]
+    runs-on: [self-hosted, amd-gpu, multi-gpu]

+    container:
+      image: ghcr.io/huggingface/optimum-benchmark:latest-rocm
+      options: --ipc host
+        --shm-size "16gb"
+        --group-add video
+        --device /dev/kfd
+        --device /dev/dri
+        --env ROCR_VISIBLE_DEVICES
+        --volume /mnt/cache/.cache/huggingface:/mnt/cache/
+
     steps:
       - name: Checkout code
         uses: actions/checkout@v4

-      - name: Set target devices
-        run: |
-          echo "DEVICE0:$DEVICE0"
-          echo "DEVICE1:$DEVICE1"
-          echo "DEVICE0=$DEVICE0" >> $GITHUB_ENV
-          echo "DEVICE1=$DEVICE1" >> $GITHUB_ENV
-
-      - name: Unroot docker image
+      - name: Install dependencies
         run: |
-          docker build --build-arg IMAGE=${{ env.IMAGE }} --build-arg USER_ID=$(id -u) --build-arg GROUP_ID=$(id -g) -t ${{ env.IMAGE }}-unroot docker/unroot
+          pip install -e .[testing,diffusers,timm,deepspeed,peft,autoawq,auto-gptq]

       - name: Run tests
-        uses: addnab/docker-run-action@v3
         env:
-          DEVICE0: ${{ env.DEVICE0 }}
-          DEVICE1: ${{ env.DEVICE1 }}
-        with:
-          image: ${{ env.IMAGE }}-unroot
-          options: |
-            --rm
-            --shm-size 64G
-            --device /dev/kfd
-            --device /dev/dri/${{ env.DEVICE0 }}
-            --device /dev/dri/${{ env.DEVICE1 }}
-            --volume ${{ github.workspace }}:/workspace
-            --workdir /workspace
-          run: |
-            pip install -e .[testing,diffusers,timm,deepspeed,peft,autoawq,auto-gptq]
-            pytest -x -s -k "cli and cuda and pytorch and (dp or ddp or device_map or deepspeed) and not (bnb or awq)"
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          PUSH_REPO_ID: optimum-benchmark/rocm
+        run: |
+          pytest -x -s -k "cli and cuda and pytorch and (dp or ddp or device_map or deepspeed) and not (bnb or awq)"
42 changes: 17 additions & 25 deletions .github/workflows/test_cli_rocm_pytorch_single_gpu.yaml
@@ -25,39 +25,31 @@ concurrency:
   cancel-in-progress: true
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}

-env:
-  IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-rocm
-
 jobs:
   run_cli_rocm_pytorch_single_gpu_tests:
-    runs-on: [single-gpu, amd-gpu, mi250, ci]
+    runs-on: [self-hosted, amd-gpu, single-gpu]

+    container:
+      image: ghcr.io/huggingface/optimum-benchmark:latest-rocm
+      options: --ipc host
+        --shm-size "16gb"
+        --group-add video
+        --device /dev/kfd
+        --device /dev/dri
+        --env ROCR_VISIBLE_DEVICES
+        --volume /mnt/cache/.cache/huggingface:/mnt/cache/
+
     steps:
       - name: Checkout code
         uses: actions/checkout@v4

-      - name: Set target devices
-        run: |
-          echo "DEVICE:$DEVICE"
-          echo "DEVICE=$DEVICE" >> $GITHUB_ENV
-
-      - name: Unroot docker image
+      - name: Install dependencies
         run: |
-          docker build --build-arg IMAGE=${{ env.IMAGE }} --build-arg USER_ID=$(id -u) --build-arg GROUP_ID=$(id -g) -t ${{ env.IMAGE }}-unroot docker/unroot
+          pip install -e .[testing,diffusers,timm,peft,autoawq,auto-gptq]

       - name: Run tests
-        uses: addnab/docker-run-action@v3
         env:
-          DEVICE: ${{ env.DEVICE }}
-        with:
-          image: ${{ env.IMAGE }}-unroot
-          options: |
-            --rm
-            --shm-size 64G
-            --device /dev/kfd
-            --device /dev/dri/${{ env.DEVICE }}
-            --volume ${{ github.workspace }}:/workspace
-            --workdir /workspace
-          run: |
-            pip install -e .[testing,diffusers,timm,peft,autoawq,auto-gptq]
-            pytest -x -s -k "cli and cuda and pytorch and not (dp or ddp or device_map or deepspeed) and not (bnb or awq)"
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          PUSH_REPO_ID: optimum-benchmark/rocm
+        run: |
+          pytest -x -s -k "cli and cuda and pytorch and not (dp or ddp or device_map or deepspeed) and not (bnb or awq)"
2 changes: 1 addition & 1 deletion docker/rocm/Dockerfile
@@ -30,7 +30,7 @@ RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-reco

 # Install PyTorch
 ARG TORCH_ROCM=rocm5.7
-ARG TORCH_VERSION=2.2.2
+ARG TORCH_VERSION=stable

 RUN if [ "${TORCH_VERSION}" = "stable" ]; then \
     pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/${TORCH_ROCM} ; \
11 changes: 9 additions & 2 deletions optimum_benchmark/launchers/device_isolation_utils.py
@@ -65,24 +65,31 @@ def get_amd_devices_pids(device_ids: str) -> Set[int]:
         )

     amdsmi.amdsmi_init()
+    permission_denied = False

     devices_pids = set()
     devices_ids = list(map(int, device_ids.split(",")))

     processor_handles = amdsmi.amdsmi_get_processor_handles()
     for device_id in devices_ids:
         processor_handle = processor_handles[device_id]

+        if permission_denied:
+            continue
+
         try:
             # these functions fail a lot for no apparent reason
             processes_handles = amdsmi.amdsmi_get_gpu_process_list(processor_handle)
-        except Exception:
+        except Exception as e:
+            permission_denied = "Permission denied" in str(e)
             continue

         for process_handle in processes_handles:
             try:
                 # these functions fail a lot for no apparent reason
                 info = amdsmi.amdsmi_get_gpu_process_info(processor_handle, process_handle)
-            except Exception:
+            except Exception as e:
+                permission_denied = "Permission denied" in str(e)
                 continue

             if info["memory_usage"]["vram_mem"] == 4096:
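For context, the change above latches a permission_denied flag the first time an amdsmi process query fails, so the remaining devices are not queried again with a call that is known to fail on unprivileged runners. Below is a minimal, self-contained sketch of that pattern, not the repository's exact code: the function name is illustrative, it assumes the amdsmi Python bindings referenced in the diff are installed, and it omits the vram_mem filtering that follows in the truncated part of the hunk.

from typing import Set

import amdsmi  # AMD SMI Python bindings, as used in the diff above


def amd_devices_pids_sketch(device_ids: str) -> Set[int]:
    """Collect PIDs of processes using the given AMD GPUs, skipping further
    per-process queries once a permission error has been observed."""
    amdsmi.amdsmi_init()
    permission_denied = False
    devices_pids: Set[int] = set()

    processor_handles = amdsmi.amdsmi_get_processor_handles()
    for device_id in map(int, device_ids.split(",")):
        processor_handle = processor_handles[device_id]

        if permission_denied:
            # A previous amdsmi process query failed with "Permission denied";
            # it will fail for every device, so skip the remaining queries.
            continue

        try:
            processes_handles = amdsmi.amdsmi_get_gpu_process_list(processor_handle)
        except Exception as e:
            permission_denied = "Permission denied" in str(e)
            continue

        for process_handle in processes_handles:
            try:
                info = amdsmi.amdsmi_get_gpu_process_info(processor_handle, process_handle)
            except Exception as e:
                permission_denied = "Permission denied" in str(e)
                continue
            devices_pids.add(info["pid"])

    return devices_pids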
17 changes: 12 additions & 5 deletions optimum_benchmark/trackers/memory.py
@@ -290,6 +290,7 @@ def monitor_gpu_vram_memory(monitored_pid: int, device_ids: List[int], connectio

         amdsmi.amdsmi_init()
         rocml.smi_initialize()
+        permission_denied = False
         devices_handles = amdsmi.amdsmi_get_processor_handles()

         while not stop:
@@ -300,27 +301,33 @@ def monitor_gpu_vram_memory(monitored_pid: int, device_ids: List[int], connectio

             for device_id in device_ids:
                 device_handle = devices_handles[device_id]

+                try:
+                    used_global_memory += rocml.smi_get_device_memory_used(device_id)
+                except Exception as e:
+                    LOGGER.warning(f"Could not get memory usage for device {device_id}: {e}")
+
+                if permission_denied:
+                    continue
+
                 try:
                     processes_handles = amdsmi.amdsmi_get_gpu_process_list(device_handle)
                 except Exception as e:
                     LOGGER.warning(f"Could not get process list for device {device_id}: {e}")
+                    permission_denied = "Permission Denied" in str(e)
                     continue

                 for process_handle in processes_handles:
                     try:
                         gpu_process_info = amdsmi.amdsmi_get_gpu_process_info(device_handle, process_handle)
                     except Exception as e:
                         LOGGER.warning(f"Could not get process info for process {process_handle}: {e}")
+                        permission_denied = "Permission Denied" in str(e)
                         continue

                     if gpu_process_info["pid"] in monitored_pids:
                         max_used_process_memory += gpu_process_info["memory_usage"]["vram_mem"]

-                try:
-                    used_global_memory += rocml.smi_get_device_memory_used(device_id)
-                except Exception as e:
-                    LOGGER.warning(f"Could not get memory usage for device {device_id}: {e}")
-
             max_used_global_memory = max(max_used_global_memory, used_global_memory)
             max_used_process_memory = max(max_used_process_memory, used_process_memory)
             stop = connection.poll(interval)
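The reordering in monitor_gpu_vram_memory appears intended to keep global VRAM tracking alive even when per-process queries are not permitted: the rocml read is attempted for every device, while only the amdsmi process accounting is gated by the permission_denied latch. A rough single-pass sketch of that loop shape follows; it is not the repository's code, it assumes rocml here is pyrsmi's rocml module (the import is not shown in the hunk), and it leaves out the connection/interval plumbing of the real monitor.

import amdsmi
from pyrsmi import rocml  # assumption: the rocml used in the diff is pyrsmi's module


def sample_vram_once(device_ids, monitored_pids):
    """One sampling pass: global VRAM per device via rocml, per-process VRAM via amdsmi."""
    amdsmi.amdsmi_init()
    rocml.smi_initialize()
    permission_denied = False
    devices_handles = amdsmi.amdsmi_get_processor_handles()

    used_global_memory = 0
    used_process_memory = 0

    for device_id in device_ids:
        device_handle = devices_handles[device_id]

        # Global memory is read for every device; only the amdsmi process
        # queries below are skipped once permission_denied has been set.
        try:
            used_global_memory += rocml.smi_get_device_memory_used(device_id)
        except Exception:
            pass

        if permission_denied:
            continue  # per-process accounting is known to fail; skip it

        try:
            processes_handles = amdsmi.amdsmi_get_gpu_process_list(device_handle)
        except Exception as e:
            permission_denied = "Permission Denied" in str(e)
            continue

        for process_handle in processes_handles:
            try:
                info = amdsmi.amdsmi_get_gpu_process_info(device_handle, process_handle)
            except Exception as e:
                permission_denied = "Permission Denied" in str(e)
                continue
            if info["pid"] in monitored_pids:
                used_process_memory += info["memory_usage"]["vram_mem"]

    return used_global_memory, used_process_memory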
