Commit: Update ROCm (#253)

IlyasMoutawwakil authored Sep 4, 2024
1 parent ad8304e commit 6dfec54
Showing 14 changed files with 85 additions and 38 deletions.
.github/workflows/test_api_rocm.yaml (2 additions, 1 deletion)
@@ -27,7 +27,7 @@ concurrency:

jobs:
run_api_rocm_tests:
runs-on: [self-hosted, amd-gpu, single-gpu, mi250]
runs-on: [self-hosted, amd-gpu, single-gpu]

container:
image: ghcr.io/huggingface/optimum-benchmark:latest-rocm
@@ -37,6 +37,7 @@ jobs:
--device /dev/kfd
--device /dev/dri
--env ROCR_VISIBLE_DEVICES
--env HIP_VISIBLE_DEVICES=0
--volume /mnt/cache/.cache/huggingface:/mnt/cache/

steps:
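The HIP_VISIBLE_DEVICES variable added to the container options here (and in the other ROCm workflows below) pins which GPUs the job can see, alongside the /dev/kfd and /dev/dri device mounts. A minimal sketch, assuming a ROCm build of PyTorch inside the container, for checking what the runner actually exposes; the script is illustrative and not part of this commit:

import os

import torch  # on ROCm builds, HIP devices are reported through the torch.cuda API

print("ROCR_VISIBLE_DEVICES:", os.environ.get("ROCR_VISIBLE_DEVICES"))
print("HIP_VISIBLE_DEVICES:", os.environ.get("HIP_VISIBLE_DEVICES"))
print("visible GPUs:", torch.cuda.device_count())
for index in range(torch.cuda.device_count()):
    print(index, torch.cuda.get_device_name(index))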
.github/workflows/test_cli_rocm_pytorch_multi_gpu.yaml (2 additions, 1 deletion)
@@ -37,6 +37,7 @@ jobs:
--device /dev/kfd
--device /dev/dri
--env ROCR_VISIBLE_DEVICES
--env HIP_VISIBLE_DEVICES=0,1
--volume /mnt/cache/.cache/huggingface:/mnt/cache/

steps:
@@ -45,7 +46,7 @@ jobs:

- name: Install dependencies
run: |
pip install -e .[testing,diffusers,timm,deepspeed,peft,autoawq,auto-gptq]
pip install -e .[testing,diffusers,timm,deepspeed,peft,autoawq,auto-gptq] "deepspeed<0.15"
- name: Run tests
env:
.github/workflows/test_cli_rocm_pytorch_single_gpu.yaml (2 additions, 1 deletion)
@@ -37,6 +37,7 @@ jobs:
--device /dev/kfd
--device /dev/dri
--env ROCR_VISIBLE_DEVICES
--env HIP_VISIBLE_DEVICES=0
--volume /mnt/cache/.cache/huggingface:/mnt/cache/

steps:
@@ -52,4 +53,4 @@ jobs:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
PUSH_REPO_ID: optimum-benchmark/rocm
run: |
pytest -x -s -k "cli and cuda and pytorch and not (dp or ddp or device_map or deepspeed) and not (bnb or awq)"
pytest -x -s -k "cli and cuda and pytorch and not (dp or ddp or device_map or deepspeed) and not (bnb or awq or gptq)"
Makefile (1 addition, 1 deletion)
@@ -172,10 +172,10 @@ test_cli_rocm_pytorch_multi_gpu:
test_cli_rocm_pytorch_single_gpu:
pytest -s -k "cli and rocm and pytorch and not (dp or ddp or device_map or deepspeed) and not (bnb or awq)"

# llm-perf
test_cli_llama_cpp:
pytest -s -k "llama_cpp"

# llm-perf
install_llm_perf_cuda_pytorch:
pip install packaging && pip install flash-attn einops scipy auto-gptq optimum bitsandbytes autoawq codecarbon
pip install -U transformers huggingface_hub[hf_transfer]
docker/rocm/Dockerfile (5 additions, 5 deletions)
@@ -12,14 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.

ARG ROCM_VERSION=5.7.1
ARG ROCM_VERSION=6.1.2
ARG UBUNTU_VERSION=22.04

FROM rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}

# Install necessary packages
ENV PATH="/opt/rocm/bin:${PATH}"
ENV DEBIAN_FRONTEND noninteractive
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \
sudo build-essential git bash-completion \
python3.10 python3-pip python3.10-dev && \
@@ -29,13 +29,13 @@ RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-reco
cd /opt/rocm/share/amd_smi && pip install .

# Install PyTorch
ARG TORCH_ROCM=rocm5.7
ARG TORCH_ROCM=rocm6.1
ARG TORCH_VERSION=stable

RUN if [ "${TORCH_VERSION}" = "stable" ]; then \
pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/${TORCH_ROCM} ; \
elif [ "${TORCH_VERSION}" = "nighly" ]; then \
pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/${TORCH_ROCM} ; \
elif [ "${TORCH_VERSION}" = "nightly" ]; then \
pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/${TORCH_ROCM} ; \
else \
pip install --no-cache-dir torch==${TORCH_VERSION} torchvision torchaudio --index-url https://download.pytorch.org/whl/${TORCH_ROCM} ; \
fi
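A quick way to verify that an image built from this Dockerfile picked up the intended wheel is to inspect torch's build metadata. This is a sketch assuming the default TORCH_ROCM=rocm6.1 build argument; it is not part of the commit:

import torch

print(torch.__version__)  # ROCm wheels carry a local version suffix such as "+rocm6.1"
print(torch.version.hip)  # HIP runtime version string; None on CUDA or CPU builds
assert torch.version.hip is not None, "expected a ROCm build of PyTorch"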
docker/unroot/Dockerfile (1 addition, 1 deletion)
@@ -1,4 +1,4 @@
ARG IMAGE
ARG IMAGE="optimum-benchmark:latest"

FROM $IMAGE

optimum_benchmark/backends/config.py (2 additions, 1 deletion)
@@ -97,9 +97,10 @@ def __post_init__(self):
if is_nvidia_system():
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = self.device_ids
LOGGER.info(f"CUDA_VISIBLE_DEVICES was set to {os.environ['CUDA_VISIBLE_DEVICES']}.")
elif is_rocm_system():
# https://rocm.docs.amd.com/en/latest/conceptual/gpu-isolation.html
os.environ["ROCR_VISIBLE_DEVICES"] = self.device_ids
LOGGER.info(f"ROCR_VISIBLE_DEVICES was set to {os.environ['ROCR_VISIBLE_DEVICES']}.")
else:
raise RuntimeError("CUDA device is only supported on systems with NVIDIA or ROCm drivers.")

optimum_benchmark/launchers/base.py (3 additions, 3 deletions)
@@ -39,10 +39,10 @@ def launch(self, worker: Callable[..., BenchmarkReport], worker_args: List[Any])
@contextmanager
def device_isolation(self, pid: int, device_ids: Optional[str] = None):
if device_ids is None:
if is_nvidia_system():
device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", None)
elif is_rocm_system():
if is_rocm_system():
device_ids = os.environ.get("ROCR_VISIBLE_DEVICES", None)
elif is_nvidia_system():
device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", None)

self.device_isolation_process = Process(
target=assert_device_isolation,
optimum_benchmark/system_utils.py (14 additions, 11 deletions)
@@ -173,7 +173,9 @@ def get_gpu_vram_mb() -> List[int]:

def get_gpu_device_ids() -> str:
if is_nvidia_system():
if os.environ.get("CUDA_VISIBLE_DEVICES", None) is not None:
if os.environ.get("NVIDIA_VISIBLE_DEVICES", None) is not None:
device_ids = os.environ["NVIDIA_VISIBLE_DEVICES"]
elif os.environ.get("CUDA_VISIBLE_DEVICES", None) is not None:
device_ids = os.environ["CUDA_VISIBLE_DEVICES"]
else:
if not is_pynvml_available():
@@ -187,12 +189,12 @@ def get_gpu_device_ids() -> str:
device_ids = ",".join(str(i) for i in device_ids)
pynvml.nvmlShutdown()
elif is_rocm_system():
if os.environ.get("GPU_DEVICE_ORDINAL", None) is not None:
device_ids = os.environ["GPU_DEVICE_ORDINAL"]
if os.environ.get("ROCR_VISIBLE_DEVICES", None) is not None:
device_ids = os.environ["ROCR_VISIBLE_DEVICES"]
elif os.environ.get("HIP_VISIBLE_DEVICES", None) is not None:
device_ids = os.environ["HIP_VISIBLE_DEVICES"]
elif os.environ.get("ROCR_VISIBLE_DEVICES", None) is not None:
device_ids = os.environ["ROCR_VISIBLE_DEVICES"]
elif os.environ.get("CUDA_VISIBLE_DEVICES", None) is not None:
device_ids = os.environ["CUDA_VISIBLE_DEVICES"]
else:
if not is_amdsmi_available() or not is_pyrsmi_available():
raise ValueError(
@@ -201,17 +203,18 @@ def get_gpu_device_ids() -> str:
"or PyRSMI library from https://github.com/ROCm/pyrsmi."
)

if is_amdsmi_available():
if is_pyrsmi_available():
rocml.smi_initialize()
device_ids = list(range(rocml.smi_get_device_count()))
device_ids = ",".join(str(i) for i in device_ids)
rocml.smi_shutdown()

elif is_amdsmi_available():
amdsmi.amdsmi_init()
device_ids = list(range(len(amdsmi.amdsmi_get_processor_handles())))
device_ids = ",".join(str(i) for i in device_ids)
amdsmi.amdsmi_shut_down()

elif is_pyrsmi_available():
rocml.smi_initialize()
device_ids = list(range(rocml.smi_get_device_count()))
device_ids = ",".join(str(i) for i in device_ids)
rocml.smi_shutdown()
else:
raise ValueError("Couldn't infer GPU device ids.")

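The net effect is that get_gpu_device_ids() now prefers the ROCm isolation variables (ROCR_VISIBLE_DEVICES, then HIP_VISIBLE_DEVICES, then CUDA_VISIBLE_DEVICES) before falling back to enumerating devices with pyrsmi or amdsmi. A usage sketch, assuming a ROCm machine where the variable is already exported:

import os

from optimum_benchmark.system_utils import get_gpu_device_ids

os.environ["ROCR_VISIBLE_DEVICES"] = "0,1"  # simulate the isolation the workflows set up
print(get_gpu_device_ids())                 # -> "0,1"; no SMI library is needed on this path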
optimum_benchmark/trackers/energy.py (1 addition, 1 deletion)
@@ -105,7 +105,7 @@ def aggregate(efficiencies: List["Efficiency"]) -> "Efficiency":
def from_energy(energy: "Energy", volume: int, unit: str) -> "Efficiency":
return Efficiency(value=volume / energy.total if energy.total > 0 else 0, unit=unit)

def log(self, prefix: str = "method"):
def log(self, prefix: str = ""):
LOGGER.info(f"\t\t+ {prefix} energy efficiency: {self.value:f} ({self.unit})")


optimum_benchmark/trackers/latency.py (1 addition, 1 deletion)
@@ -76,7 +76,7 @@ def from_values(values: List[float], unit: str) -> "Latency":
values=values,
)

def log(self, prefix: str = "method"):
def log(self, prefix: str = ""):
stdev_percentage = 100 * self.stdev / self.mean if self.mean > 0 else 0
LOGGER.info(f"\t\t+ {prefix} latency:")
LOGGER.info(f"\t\t\t- count: {self.count}")
optimum_benchmark/trackers/memory.py (7 additions, 2 deletions)
@@ -81,7 +81,7 @@ def aggregate(memories: List["Memory"]) -> "Memory":
max_allocated=max_allocated,
)

def log(self, prefix: str = "forward"):
def log(self, prefix: str = ""):
LOGGER.info(f"\t\t+ {prefix} memory:")
if self.max_ram is not None:
LOGGER.info(f"\t\t\t- max RAM: {self.max_ram:f} ({self.unit})")
@@ -303,7 +303,12 @@ def monitor_gpu_vram_memory(monitored_pid: int, device_ids: List[int], connectio
device_handle = devices_handles[device_id]

try:
used_global_memory += rocml.smi_get_device_memory_used(device_id)
if is_amdsmi_available():
used_global_memory += amdsmi.amdsmi_get_gpu_memory_total(
device_handle, mem_type=amdsmi.AmdSmiMemoryType.VRAM
)
elif is_pyrsmi_available():
used_global_memory += rocml.smi_get_device_memory_used(device_id, type="VRAM")
except Exception as e:
LOGGER.warning(f"Could not get memory usage for device {device_id}: {e}")

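The VRAM reading now goes through amdsmi when it is available and only falls back to pyrsmi. A condensed sketch of that fallback, reusing only the calls that appear in the diff; SMI initialization, handle lookup, and error handling are assumed to have happened already:

def rocm_vram_bytes(device_id, device_handle):
    # Prefer amdsmi (newer ROCm stacks), fall back to pyrsmi, mirroring the tracker above.
    try:
        import amdsmi

        return amdsmi.amdsmi_get_gpu_memory_total(device_handle, mem_type=amdsmi.AmdSmiMemoryType.VRAM)
    except ImportError:
        from pyrsmi import rocml

        return rocml.smi_get_device_memory_used(device_id, type="VRAM")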
tests/test_api.py (35 additions, 9 deletions)
@@ -24,7 +24,7 @@
from optimum_benchmark.import_utils import get_git_revision_hash
from optimum_benchmark.scenarios.inference.config import INPUT_SHAPES
from optimum_benchmark.scenarios.training.config import DATASET_SHAPES
from optimum_benchmark.system_utils import get_gpu_device_ids
from optimum_benchmark.system_utils import is_nvidia_system, is_rocm_system
from optimum_benchmark.trackers import LatencyTracker, MemoryTracker

PUSH_REPO_ID = os.environ.get("PUSH_REPO_ID", "optimum-benchmark/local")
@@ -47,9 +47,15 @@
def test_api_launch(device, scenario, library, task, model):
benchmark_name = f"{device}_{scenario}_{library}_{task}_{model}"

device_ids = get_gpu_device_ids() if device == "cuda" else None
no_weights = False if library != "transformers" else True
device_isolation = device == "cuda"
if device == "cuda":
device_isolation = True
if is_rocm_system():
device_ids = os.environ.get("ROCR_VISIBLE_DEVICES", "0")
elif is_nvidia_system():
device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0")
else:
device_isolation = False
device_ids = None

launcher_config = ProcessConfig(device_isolation=device_isolation, device_isolation_action="error")

@@ -61,7 +67,7 @@ def test_api_launch(device, scenario, library, task, model):

elif scenario == "inference":
scenario_config = InferenceConfig(
energy=torch.version.hip is None,
energy=not is_rocm_system(),
latency=True,
memory=True,
duration=1,
@@ -72,11 +78,21 @@ def test_api_launch(device, scenario, library, task, model):
call_kwargs={"num_inference_steps": 2},
)

no_weights = False if library != "transformers" else True

backend_config = PyTorchConfig(
device=device, device_ids=device_ids, no_weights=no_weights, library=library, model=model, task=task
device=device,
device_ids=device_ids,
no_weights=no_weights,
library=library,
model=model,
task=task,
)
benchmark_config = BenchmarkConfig(
name=benchmark_name, scenario=scenario_config, launcher=launcher_config, backend=backend_config
name=benchmark_name,
scenario=scenario_config,
launcher=launcher_config,
backend=backend_config,
)
benchmark_report = Benchmark.launch(benchmark_config)

@@ -204,10 +220,20 @@ def test_api_latency_tracker(device, backend):
@pytest.mark.parametrize("device", ["cpu", "cuda"])
@pytest.mark.parametrize("backend", ["pytorch", "other"])
def test_api_memory_tracker(device, backend):
if device == "cuda" and backend == "other" and is_rocm_system():
pytest.skip("Measuring memory usage is only supported for PyTorch backend on ROCm system for now")

if torch.cuda.is_available():
reload(torch.cuda)

device_ids = get_gpu_device_ids() if device == "cuda" else None
if device == "cuda":
if is_rocm_system():
device_ids = os.environ.get("ROCR_VISIBLE_DEVICES", "0")
elif is_nvidia_system():
device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0")
else:
device_ids = None

tracker = MemoryTracker(device=device, backend=backend, device_ids=device_ids)

tracker.reset()
@@ -231,7 +257,7 @@ def test_api_memory_tracker(device, backend):
if backend == "pytorch":
measured_memory = final_memory.max_allocated - initial_memory.max_allocated
else:
# because user namespace is not visible to pynvml/amdsmi, we use global vram
# namespace is not visible to pynvml/amdsmi, so we use global vram instead of process specific.
measured_memory = final_memory.max_global_vram - initial_memory.max_global_vram
else:
measured_memory = final_memory.max_ram - initial_memory.max_ram
tests/test_cli.py (9 additions, 0 deletions)
@@ -9,6 +9,7 @@

LOGGER = getLogger("test")


FORCE_SERIAL = os.environ.get("FORCE_SERIAL", "0") == "1"
TEST_CONFIG_DIR = Path(__file__).parent / "configs"
TEST_CONFIG_NAMES = [
@@ -17,6 +18,9 @@
if config.endswith(".yaml") and not (config.startswith("_") or config.endswith("_"))
]

ROCR_VISIBLE_DEVICES = os.environ.get("ROCR_VISIBLE_DEVICES", None)
CUDA_VISIBLE_DEVICES = os.environ.get("CUDA_VISIBLE_DEVICES", None)


@pytest.mark.parametrize("config_name", TEST_CONFIG_NAMES)
def test_cli_configs(config_name):
@@ -37,6 +41,11 @@ def test_cli_configs(config_name):
else:
args += ["hydra.launcher.n_jobs=-1"]

if ROCR_VISIBLE_DEVICES is not None:
args += [f'backend.device_ids="{ROCR_VISIBLE_DEVICES}"']
elif CUDA_VISIBLE_DEVICES is not None:
args += [f'backend.device_ids="{CUDA_VISIBLE_DEVICES}"']

popen = run_subprocess_and_log_stream_output(LOGGER, args)
assert popen.returncode == 0, f"Failed to run {config_name}"

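With the override in place, the argv the test hands to the CLI looks roughly like the sketch below; the config name is a hypothetical placeholder, and only the backend.device_ids override comes from this commit:

from pathlib import Path

test_config_dir = Path("tests") / "configs"
args = [
    "optimum-benchmark",
    "--config-dir", str(test_config_dir),
    "--config-name", "cuda_pytorch_bert",  # hypothetical config name
    'backend.device_ids="0,1"',            # injected from ROCR_VISIBLE_DEVICES or CUDA_VISIBLE_DEVICES
    "hydra.launcher.n_jobs=-1",
]
print(" ".join(args))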
