Commit: Update ROCm (#253)

IlyasMoutawwakil authored Sep 4, 2024
1 parent ad8304e commit 6dfec54
Showing 14 changed files with 85 additions and 38 deletions.
.github/workflows/test_api_rocm.yaml (2 additions, 1 deletion)
@@ -27,7 +27,7 @@ concurrency:

jobs:
run_api_rocm_tests:
runs-on: [self-hosted, amd-gpu, single-gpu, mi250]
runs-on: [self-hosted, amd-gpu, single-gpu]

container:
image: ghcr.io/huggingface/optimum-benchmark:latest-rocm
@@ -37,6 +37,7 @@ jobs:
--device /dev/kfd
--device /dev/dri
--env ROCR_VISIBLE_DEVICES
--env HIP_VISIBLE_DEVICES=0
--volume /mnt/cache/.cache/huggingface:/mnt/cache/

steps:
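The HIP_VISIBLE_DEVICES variable added to the container options here (and in the other ROCm workflows below) pins which GPUs the job can see, alongside the /dev/kfd and /dev/dri device mounts. A minimal sketch, assuming a ROCm build of PyTorch inside the container, for checking what the runner actually exposes; the script is illustrative and not part of this commit:

import os

import torch  # on ROCm builds, HIP devices are reported through the torch.cuda API

print("ROCR_VISIBLE_DEVICES:", os.environ.get("ROCR_VISIBLE_DEVICES"))
print("HIP_VISIBLE_DEVICES:", os.environ.get("HIP_VISIBLE_DEVICES"))
print("visible GPUs:", torch.cuda.device_count())
for index in range(torch.cuda.device_count()):
    print(index, torch.cuda.get_device_name(index))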
.github/workflows/test_cli_rocm_pytorch_multi_gpu.yaml (2 additions, 1 deletion)
@@ -37,6 +37,7 @@ jobs:
--device /dev/kfd
--device /dev/dri
--env ROCR_VISIBLE_DEVICES
--env HIP_VISIBLE_DEVICES=0,1
--volume /mnt/cache/.cache/huggingface:/mnt/cache/

steps:
@@ -45,7 +46,7 @@ jobs:

- name: Install dependencies
run: |
pip install -e .[testing,diffusers,timm,deepspeed,peft,autoawq,auto-gptq]
pip install -e .[testing,diffusers,timm,deepspeed,peft,autoawq,auto-gptq] "deepspeed<0.15"
- name: Run tests
env:
.github/workflows/test_cli_rocm_pytorch_single_gpu.yaml (2 additions, 1 deletion)
@@ -37,6 +37,7 @@ jobs:
--device /dev/kfd
--device /dev/dri
--env ROCR_VISIBLE_DEVICES
--env HIP_VISIBLE_DEVICES=0
--volume /mnt/cache/.cache/huggingface:/mnt/cache/

steps:
@@ -52,4 +53,4 @@ jobs:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
PUSH_REPO_ID: optimum-benchmark/rocm
run: |
pytest -x -s -k "cli and cuda and pytorch and not (dp or ddp or device_map or deepspeed) and not (bnb or awq)"
pytest -x -s -k "cli and cuda and pytorch and not (dp or ddp or device_map or deepspeed) and not (bnb or awq or gptq)"
Makefile (1 addition, 1 deletion)
@@ -172,10 +172,10 @@ test_cli_rocm_pytorch_multi_gpu:
test_cli_rocm_pytorch_single_gpu:
pytest -s -k "cli and rocm and pytorch and not (dp or ddp or device_map or deepspeed) and not (bnb or awq)"

# llm-perf
test_cli_llama_cpp:
pytest -s -k "llama_cpp"

# llm-perf
install_llm_perf_cuda_pytorch:
pip install packaging && pip install flash-attn einops scipy auto-gptq optimum bitsandbytes autoawq codecarbon
pip install -U transformers huggingface_hub[hf_transfer]
docker/rocm/Dockerfile (5 additions, 5 deletions)
@@ -12,14 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.

ARG ROCM_VERSION=5.7.1
ARG ROCM_VERSION=6.1.2
ARG UBUNTU_VERSION=22.04

FROM rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}

# Install necessary packages
ENV PATH="/opt/rocm/bin:${PATH}"
ENV DEBIAN_FRONTEND noninteractive
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \
sudo build-essential git bash-completion \
python3.10 python3-pip python3.10-dev && \
@@ -29,13 +29,13 @@ RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-reco
cd /opt/rocm/share/amd_smi && pip install .

# Install PyTorch
ARG TORCH_ROCM=rocm5.7
ARG TORCH_ROCM=rocm6.1
ARG TORCH_VERSION=stable

RUN if [ "${TORCH_VERSION}" = "stable" ]; then \
pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/${TORCH_ROCM} ; \
elif [ "${TORCH_VERSION}" = "nighly" ]; then \
pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/${TORCH_ROCM} ; \
elif [ "${TORCH_VERSION}" = "nightly" ]; then \
pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/${TORCH_ROCM} ; \
else \
pip install --no-cache-dir torch==${TORCH_VERSION} torchvision torchaudio --index-url https://download.pytorch.org/whl/${TORCH_ROCM} ; \
fi
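A quick way to verify that an image built from this Dockerfile picked up the intended wheel is to inspect torch's build metadata. This is a sketch assuming the default TORCH_ROCM=rocm6.1 build argument; it is not part of the commit:

import torch

print(torch.__version__)  # ROCm wheels carry a local version suffix such as "+rocm6.1"
print(torch.version.hip)  # HIP runtime version string; None on CUDA or CPU builds
assert torch.version.hip is not None, "expected a ROCm build of PyTorch"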
docker/unroot/Dockerfile (1 addition, 1 deletion)
@@ -1,4 +1,4 @@
ARG IMAGE
ARG IMAGE="optimum-benchmark:latest"

FROM $IMAGE

optimum_benchmark/backends/config.py (2 additions, 1 deletion)
@@ -97,9 +97,10 @@ def __post_init__(self):
if is_nvidia_system():
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = self.device_ids
LOGGER.info(f"CUDA_VISIBLE_DEVICES was set to {os.environ['CUDA_VISIBLE_DEVICES']}.")
elif is_rocm_system():
# https://rocm.docs.amd.com/en/latest/conceptual/gpu-isolation.html
os.environ["ROCR_VISIBLE_DEVICES"] = self.device_ids
LOGGER.info(f"ROCR_VISIBLE_DEVICES was set to {os.environ['ROCR_VISIBLE_DEVICES']}.")
else:
raise RuntimeError("CUDA device is only supported on systems with NVIDIA or ROCm drivers.")

optimum_benchmark/launchers/base.py (3 additions, 3 deletions)
@@ -39,10 +39,10 @@ def launch(self, worker: Callable[..., BenchmarkReport], worker_args: List[Any])
@contextmanager
def device_isolation(self, pid: int, device_ids: Optional[str] = None):
if device_ids is None:
if is_nvidia_system():
device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", None)
elif is_rocm_system():
if is_rocm_system():
device_ids = os.environ.get("ROCR_VISIBLE_DEVICES", None)
elif is_nvidia_system():
device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", None)

self.device_isolation_process = Process(
target=assert_device_isolation,
optimum_benchmark/system_utils.py (14 additions, 11 deletions)
@@ -173,7 +173,9 @@ def get_gpu_vram_mb() -> List[int]:

def get_gpu_device_ids() -> str:
if is_nvidia_system():
if os.environ.get("CUDA_VISIBLE_DEVICES", None) is not None:
if os.environ.get("NVIDIA_VISIBLE_DEVICES", None) is not None:
device_ids = os.environ["NVIDIA_VISIBLE_DEVICES"]
elif os.environ.get("CUDA_VISIBLE_DEVICES", None) is not None:
device_ids = os.environ["CUDA_VISIBLE_DEVICES"]
else:
if not is_pynvml_available():
@@ -187,12 +189,12 @@ def get_gpu_device_ids() -> str:
device_ids = ",".join(str(i) for i in device_ids)
pynvml.nvmlShutdown()
elif is_rocm_system():
if os.environ.get("GPU_DEVICE_ORDINAL", None) is not None:
device_ids = os.environ["GPU_DEVICE_ORDINAL"]
if os.environ.get("ROCR_VISIBLE_DEVICES", None) is not None:
device_ids = os.environ["ROCR_VISIBLE_DEVICES"]
elif os.environ.get("HIP_VISIBLE_DEVICES", None) is not None:
device_ids = os.environ["HIP_VISIBLE_DEVICES"]
elif os.environ.get("ROCR_VISIBLE_DEVICES", None) is not None:
device_ids = os.environ["ROCR_VISIBLE_DEVICES"]
elif os.environ.get("CUDA_VISIBLE_DEVICES", None) is not None:
device_ids = os.environ["CUDA_VISIBLE_DEVICES"]
else:
if not is_amdsmi_available() or not is_pyrsmi_available():
raise ValueError(
@@ -201,17 +203,18 @@ def get_gpu_device_ids() -> str:
"or PyRSMI library from https://github.com/ROCm/pyrsmi."
)

if is_amdsmi_available():
if is_pyrsmi_available():
rocml.smi_initialize()
device_ids = list(range(rocml.smi_get_device_count()))
device_ids = ",".join(str(i) for i in device_ids)
rocml.smi_shutdown()

elif is_amdsmi_available():
amdsmi.amdsmi_init()
device_ids = list(range(len(amdsmi.amdsmi_get_processor_handles())))
device_ids = ",".join(str(i) for i in device_ids)
amdsmi.amdsmi_shut_down()

elif is_pyrsmi_available():
rocml.smi_initialize()
device_ids = list(range(rocml.smi_get_device_count()))
device_ids = ",".join(str(i) for i in device_ids)
rocml.smi_shutdown()
else:
raise ValueError("Couldn't infer GPU device ids.")

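The net effect is that get_gpu_device_ids() now prefers the ROCm isolation variables (ROCR_VISIBLE_DEVICES, then HIP_VISIBLE_DEVICES, then CUDA_VISIBLE_DEVICES) before falling back to enumerating devices with pyrsmi or amdsmi. A usage sketch, assuming a ROCm machine where the variable is already exported:

import os

from optimum_benchmark.system_utils import get_gpu_device_ids

os.environ["ROCR_VISIBLE_DEVICES"] = "0,1"  # simulate the isolation the workflows set up
print(get_gpu_device_ids())                 # -> "0,1"; no SMI library is needed on this path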
optimum_benchmark/trackers/energy.py (1 addition, 1 deletion)
@@ -105,7 +105,7 @@ def aggregate(efficiencies: List["Efficiency"]) -> "Efficiency":
def from_energy(energy: "Energy", volume: int, unit: str) -> "Efficiency":
return Efficiency(value=volume / energy.total if energy.total > 0 else 0, unit=unit)

def log(self, prefix: str = "method"):
def log(self, prefix: str = ""):
LOGGER.info(f"\t\t+ {prefix} energy efficiency: {self.value:f} ({self.unit})")


optimum_benchmark/trackers/latency.py (1 addition, 1 deletion)
@@ -76,7 +76,7 @@ def from_values(values: List[float], unit: str) -> "Latency":
values=values,
)

def log(self, prefix: str = "method"):
def log(self, prefix: str = ""):
stdev_percentage = 100 * self.stdev / self.mean if self.mean > 0 else 0
LOGGER.info(f"\t\t+ {prefix} latency:")
LOGGER.info(f"\t\t\t- count: {self.count}")
optimum_benchmark/trackers/memory.py (7 additions, 2 deletions)
@@ -81,7 +81,7 @@ def aggregate(memories: List["Memory"]) -> "Memory":
max_allocated=max_allocated,
)

def log(self, prefix: str = "forward"):
def log(self, prefix: str = ""):
LOGGER.info(f"\t\t+ {prefix} memory:")
if self.max_ram is not None:
LOGGER.info(f"\t\t\t- max RAM: {self.max_ram:f} ({self.unit})")
@@ -303,7 +303,12 @@ def monitor_gpu_vram_memory(monitored_pid: int, device_ids: List[int], connectio
device_handle = devices_handles[device_id]

try:
used_global_memory += rocml.smi_get_device_memory_used(device_id)
if is_amdsmi_available():
used_global_memory += amdsmi.amdsmi_get_gpu_memory_total(
device_handle, mem_type=amdsmi.AmdSmiMemoryType.VRAM
)
elif is_pyrsmi_available():
used_global_memory += rocml.smi_get_device_memory_used(device_id, type="VRAM")
except Exception as e:
LOGGER.warning(f"Could not get memory usage for device {device_id}: {e}")

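The VRAM reading now goes through amdsmi when it is available and only falls back to pyrsmi. A condensed sketch of that fallback, reusing only the calls that appear in the diff; SMI initialization, handle lookup, and error handling are assumed to have happened already:

def rocm_vram_bytes(device_id, device_handle):
    # Prefer amdsmi (newer ROCm stacks), fall back to pyrsmi, mirroring the tracker above.
    try:
        import amdsmi

        return amdsmi.amdsmi_get_gpu_memory_total(device_handle, mem_type=amdsmi.AmdSmiMemoryType.VRAM)
    except ImportError:
        from pyrsmi import rocml

        return rocml.smi_get_device_memory_used(device_id, type="VRAM")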
tests/test_api.py (35 additions, 9 deletions)
@@ -24,7 +24,7 @@
from optimum_benchmark.import_utils import get_git_revision_hash
from optimum_benchmark.scenarios.inference.config import INPUT_SHAPES
from optimum_benchmark.scenarios.training.config import DATASET_SHAPES
from optimum_benchmark.system_utils import get_gpu_device_ids
from optimum_benchmark.system_utils import is_nvidia_system, is_rocm_system
from optimum_benchmark.trackers import LatencyTracker, MemoryTracker

PUSH_REPO_ID = os.environ.get("PUSH_REPO_ID", "optimum-benchmark/local")
@@ -47,9 +47,15 @@
def test_api_launch(device, scenario, library, task, model):
benchmark_name = f"{device}_{scenario}_{library}_{task}_{model}"

device_ids = get_gpu_device_ids() if device == "cuda" else None
no_weights = False if library != "transformers" else True
device_isolation = device == "cuda"
if device == "cuda":
device_isolation = True
if is_rocm_system():
device_ids = os.environ.get("ROCR_VISIBLE_DEVICES", "0")
elif is_nvidia_system():
device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0")
else:
device_isolation = False
device_ids = None

launcher_config = ProcessConfig(device_isolation=device_isolation, device_isolation_action="error")

@@ -61,7 +67,7 @@ def test_api_launch(device, scenario, library, task, model):

elif scenario == "inference":
scenario_config = InferenceConfig(
energy=torch.version.hip is None,
energy=not is_rocm_system(),
latency=True,
memory=True,
duration=1,
@@ -72,11 +78,21 @@ def test_api_launch(device, scenario, library, task, model):
call_kwargs={"num_inference_steps": 2},
)

no_weights = False if library != "transformers" else True

backend_config = PyTorchConfig(
device=device, device_ids=device_ids, no_weights=no_weights, library=library, model=model, task=task
device=device,
device_ids=device_ids,
no_weights=no_weights,
library=library,
model=model,
task=task,
)
benchmark_config = BenchmarkConfig(
name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config
name=benchmark_name,
scenario=scenario_config,
launcher=launcher_config,
backend=backend_config,
)
benchmark_report = Benchmark.launch(benchmark_config)

@@ -204,10 +220,20 @@ def test_api_latency_tracker(device, backend):
@pytest.mark.parametrize("device", ["cpu", "cuda"])
@pytest.mark.parametrize("backend", ["pytorch", "other"])
def test_api_memory_tracker(device, backend):
if device == "cuda" and backend == "other" and is_rocm_system():
pytest.skip("Measuring memory usage is only supported for PyTorch backend on ROCm system for now")

if torch.cuda.is_available():
reload(torch.cuda)

device_ids = get_gpu_device_ids() if device == "cuda" else None
if device == "cuda":
if is_rocm_system():
device_ids = os.environ.get("ROCR_VISIBLE_DEVICES", "0")
elif is_nvidia_system():
device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0")
else:
device_ids = None

tracker = MemoryTracker(device=device, backend=backend, device_ids=device_ids)

tracker.reset()
@@ -231,7 +257,7 @@ def test_api_memory_tracker(device, backend):
if backend == "pytorch":
measured_memory = final_memory.max_allocated - initial_memory.max_allocated
else:
# because user namespace is not visible to pynvml/amdsmi, we use global vram
# namespace is not visible to pynvml/amdsmi, so we use global vram instead of process specific.
measured_memory = final_memory.max_global_vram - initial_memory.max_global_vram
else:
measured_memory = final_memory.max_ram - initial_memory.max_ram
tests/test_cli.py (9 additions, 0 deletions)
@@ -9,6 +9,7 @@

LOGGER = getLogger("test")


FORCE_SERIAL = os.environ.get("FORCE_SERIAL", "0") == "1"
TEST_CONFIG_DIR = Path(__file__).parent / "configs"
TEST_CONFIG_NAMES = [
@@ -17,6 +18,9 @@
if config.endswith(".yaml") and not (config.startswith("_") or config.endswith("_"))
]

ROCR_VISIBLE_DEVICES = os.environ.get("ROCR_VISIBLE_DEVICES", None)
CUDA_VISIBLE_DEVICES = os.environ.get("CUDA_VISIBLE_DEVICES", None)


@pytest.mark.parametrize("config_name", TEST_CONFIG_NAMES)
def test_cli_configs(config_name):
@@ -37,6 +41,11 @@ def test_cli_configs(config_name):
else:
args += ["hydra.launcher.n_jobs=-1"]

if ROCR_VISIBLE_DEVICES is not None:
args += [f'backend.device_ids="{ROCR_VISIBLE_DEVICES}"']
elif CUDA_VISIBLE_DEVICES is not None:
args += [f'backend.device_ids="{CUDA_VISIBLE_DEVICES}"']

popen = run_subprocess_and_log_stream_output(LOGGER, args)
assert popen.returncode == 0, f"Failed to run {config_name}"

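With the override in place, the argv the test hands to the CLI looks roughly like the sketch below; the config name is a hypothetical placeholder, and only the backend.device_ids override comes from this commit:

from pathlib import Path

test_config_dir = Path("tests") / "configs"
args = [
    "optimum-benchmark",
    "--config-dir", str(test_config_dir),
    "--config-name", "cuda_pytorch_bert",  # hypothetical config name
    'backend.device_ids="0,1"',            # injected from ROCR_VISIBLE_DEVICES or CUDA_VISIBLE_DEVICES
    "hydra.launcher.n_jobs=-1",
]
print(" ".join(args))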
