via ompi internal cuda support
$ sudo apt install -y git autoconf automake libtool flex gcc g++ gfortran python3
$ sudo ln -s /usr/bin/python3 /usr/bin/python
$ ./configure --enable-mca-dso=btl-smcuda,rcache-rgpusm,rcache-gpusm,accelerator-cuda \
--with-cuda=/usr/local/cuda --with-cuda-libdir=/usr/local/cuda/lib64/stubs --prefix=/path/to/ompi-to-install
# Reset the search paths to a fixed baseline before layering ROCm on top.
# NOTE: this overwrites whatever PATH, CPATH and LD_LIBRARY_PATH held before.
export PATH=/usr/local/bin:/usr/local/sbin:/usr/sbin:/usr/bin
export CPATH=/usr/local/include
export LD_LIBRARY_PATH=/usr/local/lib:/usr/lib64:/usr/lib32:/usr/libx32:/usr/lib

# ROCm installation root; read by the module-rocm alias below.
export ROCM_PATH=/opt/rocm

# module-rocm: prepend ROCm's bin/, include/ and lib/ directories to PATH,
# CPATH and LD_LIBRARY_PATH respectively.
#
# The alias body is single-quoted so the variables are expanded when the
# alias is *run*, not when it is defined; it therefore prepends to the
# values in effect at that moment and honours a later change of ROCM_PATH.
# There must be no whitespace between $ROCM_PATH and the sub-directory:
# "export PATH=$ROCM_PATH /bin:..." would set PATH to the bare ROCm root and
# then fail on "/bin:..." as an invalid identifier.
alias module-rocm='export PATH="$ROCM_PATH/bin:$PATH"; '\
'export CPATH="$ROCM_PATH/include:$CPATH"; '\
'export LD_LIBRARY_PATH="$ROCM_PATH/lib:$LD_LIBRARY_PATH"'
compile from source and install ucx
$ git clone -b v1.11.x https://github.com/openucx/ucx.git
$ cd ucx && ./autogen.sh
$ mkdir build && cd build
$ ../contrib/configure-release --with-rocm=/opt/rocm --without-cuda --enable-optimizations \
--disable-logging --disable-debug --disable-assertions \
--disable-params-check --without-java --prefix=/path/to/install/ucx
$ make -j$(nproc) install
compile from source and install ROCM-AWARE OMPI
$ git clone --recursive -b v5.0.x https://github.com/open-mpi/ompi.git
$ cd ompi
$ ./autogen.pl
$ mkdir build && cd build && apt install flex
$ ../configure --enable-mca-no-build=btl-uct --enable-mpi1-compatibility \
--with-rocm=/opt/rocm --with-slurm --with-pmix \
--prefix=/path/to/install/ompi --with-ucx=/path/to/install/ucx \
CC=/opt/rocm/bin/amdclang CXX=/opt/rocm/bin/amdclang++ FC=/opt/rocm/bin/amdflang
$ make -j$(nproc) install
$ tar xfz osu-micro-benchmarks-7.0.1.tar.gz
$ cd osu-micro-benchmarks-7.0.1 && ./configure --enable-rocm --with-rocm=/opt/rocm \
CC=mpicc CXX=mpicxx
$ make -j$(nproc)
$ mpirun -np 2 --allow-run-as-root --mca btl '^openib' -x UCX_TLS=sm,self,rocm_copy,rocm_ipc --mca pml ucx ./c/mpi/pt2pt/osu_bw -d rocm D D