WIP fix rocm runners (#249)
Co-authored-by: IlyasMoutawwakil <moutawwakil.ilyas.tsi@gmail.com>
baptistecolle and IlyasMoutawwakil authored Aug 30, 2024
1 parent 4d3f68e commit 64616c2
Showing 6 changed files with 72 additions and 90 deletions.
44 changes: 16 additions & 28 deletions .github/workflows/test_api_rocm.yaml
@@ -25,43 +25,31 @@ concurrency:
   cancel-in-progress: true
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}

-env:
-  IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-rocm
-
 jobs:
-  build_image_and_run_api_rocm_tests:
-    runs-on: [single-gpu, amd-gpu, mi250, ci]
+  run_api_rocm_tests:
+    runs-on: [self-hosted, amd-gpu, single-gpu, mi250]

+    container:
+      image: ghcr.io/huggingface/optimum-benchmark:latest-rocm
+      options: --ipc host
+        --shm-size "16gb"
+        --group-add video
+        --device /dev/kfd
+        --device /dev/dri
+        --env ROCR_VISIBLE_DEVICES
+        --volume /mnt/cache/.cache/huggingface:/mnt/cache/
+
     steps:
       - name: Checkout
         uses: actions/checkout@v4

-      - name: Set target devices
-        run: |
-          echo "DEVICE:$DEVICE"
-          echo "DEVICE=$DEVICE" >> $GITHUB_ENV
-
-      - name: Unroot docker image
+      - name: Install dependencies
         run: |
-          docker build --build-arg IMAGE=${{ env.IMAGE }} --build-arg USER_ID=$(id -u) --build-arg GROUP_ID=$(id -g) -t ${{ env.IMAGE }}-unroot docker/unroot
+          pip install -e .[testing,timm,diffusers,codecarbon]

       - name: Run tests
-        uses: addnab/docker-run-action@v3
         env:
-          DEVICE: ${{ env.DEVICE }}
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
           PUSH_REPO_ID: optimum-benchmark/rocm
-        with:
-          image: ${{ env.IMAGE }}-unroot
-          options: |
-            --rm
-            --shm-size 64G
-            --env HF_TOKEN
-            --env PUSH_REPO_ID
-            --device /dev/kfd
-            --device /dev/dri/${{ env.DEVICE }}
-            --volume ${{ github.workspace }}:/workspace
-            --workdir /workspace
-          run: |
-            pip install -e .[testing,timm,diffusers,codecarbon]
-            pytest -s -x -k "api and cuda"
+        run: |
+          pytest -s -x -k "api and cuda"
46 changes: 17 additions & 29 deletions .github/workflows/test_cli_rocm_pytorch_multi_gpu.yaml
@@ -25,43 +25,31 @@ concurrency:
   cancel-in-progress: true
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}

-env:
-  IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-rocm
-
 jobs:
   run_cli_rocm_pytorch_multi_gpu_tests:
-    runs-on: [multi-gpu, amd-gpu, mi250, ci]
+    runs-on: [self-hosted, amd-gpu, multi-gpu]

+    container:
+      image: ghcr.io/huggingface/optimum-benchmark:latest-rocm
+      options: --ipc host
+        --shm-size "16gb"
+        --group-add video
+        --device /dev/kfd
+        --device /dev/dri
+        --env ROCR_VISIBLE_DEVICES
+        --volume /mnt/cache/.cache/huggingface:/mnt/cache/
+
     steps:
       - name: Checkout code
         uses: actions/checkout@v4

-      - name: Set target devices
-        run: |
-          echo "DEVICE0:$DEVICE0"
-          echo "DEVICE1:$DEVICE1"
-          echo "DEVICE0=$DEVICE0" >> $GITHUB_ENV
-          echo "DEVICE1=$DEVICE1" >> $GITHUB_ENV
-
-      - name: Unroot docker image
+      - name: Install dependencies
         run: |
-          docker build --build-arg IMAGE=${{ env.IMAGE }} --build-arg USER_ID=$(id -u) --build-arg GROUP_ID=$(id -g) -t ${{ env.IMAGE }}-unroot docker/unroot
+          pip install -e .[testing,diffusers,timm,deepspeed,peft,autoawq,auto-gptq]

       - name: Run tests
-        uses: addnab/docker-run-action@v3
         env:
-          DEVICE0: ${{ env.DEVICE0 }}
-          DEVICE1: ${{ env.DEVICE1 }}
-        with:
-          image: ${{ env.IMAGE }}-unroot
-          options: |
-            --rm
-            --shm-size 64G
-            --device /dev/kfd
-            --device /dev/dri/${{ env.DEVICE0 }}
-            --device /dev/dri/${{ env.DEVICE1 }}
-            --volume ${{ github.workspace }}:/workspace
-            --workdir /workspace
-          run: |
-            pip install -e .[testing,diffusers,timm,deepspeed,peft,autoawq,auto-gptq]
-            pytest -x -s -k "cli and cuda and pytorch and (dp or ddp or device_map or deepspeed) and not (bnb or awq)"
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          PUSH_REPO_ID: optimum-benchmark/rocm
+        run: |
+          pytest -x -s -k "cli and cuda and pytorch and (dp or ddp or device_map or deepspeed) and not (bnb or awq)"
42 changes: 17 additions & 25 deletions .github/workflows/test_cli_rocm_pytorch_single_gpu.yaml
@@ -25,39 +25,31 @@ concurrency:
   cancel-in-progress: true
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}

-env:
-  IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-rocm
-
 jobs:
   run_cli_rocm_pytorch_single_gpu_tests:
-    runs-on: [single-gpu, amd-gpu, mi250, ci]
+    runs-on: [self-hosted, amd-gpu, single-gpu]

+    container:
+      image: ghcr.io/huggingface/optimum-benchmark:latest-rocm
+      options: --ipc host
+        --shm-size "16gb"
+        --group-add video
+        --device /dev/kfd
+        --device /dev/dri
+        --env ROCR_VISIBLE_DEVICES
+        --volume /mnt/cache/.cache/huggingface:/mnt/cache/
+
     steps:
       - name: Checkout code
         uses: actions/checkout@v4

-      - name: Set target devices
-        run: |
-          echo "DEVICE:$DEVICE"
-          echo "DEVICE=$DEVICE" >> $GITHUB_ENV
-
-      - name: Unroot docker image
+      - name: Install dependencies
         run: |
-          docker build --build-arg IMAGE=${{ env.IMAGE }} --build-arg USER_ID=$(id -u) --build-arg GROUP_ID=$(id -g) -t ${{ env.IMAGE }}-unroot docker/unroot
+          pip install -e .[testing,diffusers,timm,peft,autoawq,auto-gptq]

       - name: Run tests
-        uses: addnab/docker-run-action@v3
         env:
-          DEVICE: ${{ env.DEVICE }}
-        with:
-          image: ${{ env.IMAGE }}-unroot
-          options: |
-            --rm
-            --shm-size 64G
-            --device /dev/kfd
-            --device /dev/dri/${{ env.DEVICE }}
-            --volume ${{ github.workspace }}:/workspace
-            --workdir /workspace
-          run: |
-            pip install -e .[testing,diffusers,timm,peft,autoawq,auto-gptq]
-            pytest -x -s -k "cli and cuda and pytorch and not (dp or ddp or device_map or deepspeed) and not (bnb or awq)"
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          PUSH_REPO_ID: optimum-benchmark/rocm
+        run: |
+          pytest -x -s -k "cli and cuda and pytorch and not (dp or ddp or device_map or deepspeed) and not (bnb or awq)"
2 changes: 1 addition & 1 deletion docker/rocm/Dockerfile
@@ -30,7 +30,7 @@ RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-reco

 # Install PyTorch
 ARG TORCH_ROCM=rocm5.7
-ARG TORCH_VERSION=2.2.2
+ARG TORCH_VERSION=stable

 RUN if [ "${TORCH_VERSION}" = "stable" ]; then \
     pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/${TORCH_ROCM} ; \
11 changes: 9 additions & 2 deletions optimum_benchmark/launchers/device_isolation_utils.py
@@ -65,24 +65,31 @@ def get_amd_devices_pids(device_ids: str) -> Set[int]:
         )

     amdsmi.amdsmi_init()
+    permission_denied = False

     devices_pids = set()
     devices_ids = list(map(int, device_ids.split(",")))

     processor_handles = amdsmi.amdsmi_get_processor_handles()
     for device_id in devices_ids:
         processor_handle = processor_handles[device_id]

+        if permission_denied:
+            continue
+
         try:
             # these functions fail a lot for no apparent reason
             processes_handles = amdsmi.amdsmi_get_gpu_process_list(processor_handle)
-        except Exception:
+        except Exception as e:
+            permission_denied = "Permission denied" in str(e)
             continue

         for process_handle in processes_handles:
             try:
                 # these functions fail a lot for no apparent reason
                 info = amdsmi.amdsmi_get_gpu_process_info(processor_handle, process_handle)
-            except Exception:
+            except Exception as e:
+                permission_denied = "Permission denied" in str(e)
                 continue

             if info["memory_usage"]["vram_mem"] == 4096:
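For context, the change above latches a permission_denied flag the first time an amdsmi process query fails, so the remaining devices are not queried again with a call that is known to fail on unprivileged runners. Below is a minimal, self-contained sketch of that pattern, not the repository's exact code: the function name is illustrative, it assumes the amdsmi Python bindings referenced in the diff are installed, and it omits the vram_mem filtering that follows in the truncated part of the hunk.

from typing import Set

import amdsmi  # AMD SMI Python bindings, as used in the diff above


def amd_devices_pids_sketch(device_ids: str) -> Set[int]:
    """Collect PIDs of processes using the given AMD GPUs, skipping further
    per-process queries once a permission error has been observed."""
    amdsmi.amdsmi_init()
    permission_denied = False
    devices_pids: Set[int] = set()

    processor_handles = amdsmi.amdsmi_get_processor_handles()
    for device_id in map(int, device_ids.split(",")):
        processor_handle = processor_handles[device_id]

        if permission_denied:
            # A previous amdsmi process query failed with "Permission denied";
            # it will fail for every device, so skip the remaining queries.
            continue

        try:
            processes_handles = amdsmi.amdsmi_get_gpu_process_list(processor_handle)
        except Exception as e:
            permission_denied = "Permission denied" in str(e)
            continue

        for process_handle in processes_handles:
            try:
                info = amdsmi.amdsmi_get_gpu_process_info(processor_handle, process_handle)
            except Exception as e:
                permission_denied = "Permission denied" in str(e)
                continue
            devices_pids.add(info["pid"])

    return devices_pids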
17 changes: 12 additions & 5 deletions optimum_benchmark/trackers/memory.py
@@ -290,6 +290,7 @@ def monitor_gpu_vram_memory(monitored_pid: int, device_ids: List[int], connectio

         amdsmi.amdsmi_init()
         rocml.smi_initialize()
+        permission_denied = False
         devices_handles = amdsmi.amdsmi_get_processor_handles()

         while not stop:
@@ -300,27 +301,33 @@ def monitor_gpu_vram_memory(monitored_pid: int, device_ids: List[int], connectio

             for device_id in device_ids:
                 device_handle = devices_handles[device_id]

+                try:
+                    used_global_memory += rocml.smi_get_device_memory_used(device_id)
+                except Exception as e:
+                    LOGGER.warning(f"Could not get memory usage for device {device_id}: {e}")
+
+                if permission_denied:
+                    continue
+
                 try:
                     processes_handles = amdsmi.amdsmi_get_gpu_process_list(device_handle)
                 except Exception as e:
                     LOGGER.warning(f"Could not get process list for device {device_id}: {e}")
+                    permission_denied = "Permission Denied" in str(e)
                     continue

                 for process_handle in processes_handles:
                     try:
                         gpu_process_info = amdsmi.amdsmi_get_gpu_process_info(device_handle, process_handle)
                     except Exception as e:
                         LOGGER.warning(f"Could not get process info for process {process_handle}: {e}")
+                        permission_denied = "Permission Denied" in str(e)
                         continue

                     if gpu_process_info["pid"] in monitored_pids:
                         max_used_process_memory += gpu_process_info["memory_usage"]["vram_mem"]

-                try:
-                    used_global_memory += rocml.smi_get_device_memory_used(device_id)
-                except Exception as e:
-                    LOGGER.warning(f"Could not get memory usage for device {device_id}: {e}")
-
             max_used_global_memory = max(max_used_global_memory, used_global_memory)
             max_used_process_memory = max(max_used_process_memory, used_process_memory)
             stop = connection.poll(interval)
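The reordering in monitor_gpu_vram_memory appears intended to keep global VRAM tracking alive even when per-process queries are not permitted: the rocml read is attempted for every device, while only the amdsmi process accounting is gated by the permission_denied latch. A rough single-pass sketch of that loop shape follows; it is not the repository's code, it assumes rocml here is pyrsmi's rocml module (the import is not shown in the hunk), and it leaves out the connection/interval plumbing of the real monitor.

import amdsmi
from pyrsmi import rocml  # assumption: the rocml used in the diff is pyrsmi's module


def sample_vram_once(device_ids, monitored_pids):
    """One sampling pass: global VRAM per device via rocml, per-process VRAM via amdsmi."""
    amdsmi.amdsmi_init()
    rocml.smi_initialize()
    permission_denied = False
    devices_handles = amdsmi.amdsmi_get_processor_handles()

    used_global_memory = 0
    used_process_memory = 0

    for device_id in device_ids:
        device_handle = devices_handles[device_id]

        # Global memory is read for every device; only the amdsmi process
        # queries below are skipped once permission_denied has been set.
        try:
            used_global_memory += rocml.smi_get_device_memory_used(device_id)
        except Exception:
            pass

        if permission_denied:
            continue  # per-process accounting is known to fail; skip it

        try:
            processes_handles = amdsmi.amdsmi_get_gpu_process_list(device_handle)
        except Exception as e:
            permission_denied = "Permission Denied" in str(e)
            continue

        for process_handle in processes_handles:
            try:
                info = amdsmi.amdsmi_get_gpu_process_info(device_handle, process_handle)
            except Exception as e:
                permission_denied = "Permission Denied" in str(e)
                continue
            if info["pid"] in monitored_pids:
                used_process_memory += info["memory_usage"]["vram_mem"]

    return used_global_memory, used_process_memory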
