Skip to content

Commit

Permalink
Merge pull request #13 from DragonHPC/version-0.9
Browse files Browse the repository at this point in the history
Version 0.9
  • Loading branch information
kentdlee authored May 3, 2024
2 parents ca9f372 + 215a57a commit c608fba
Show file tree
Hide file tree
Showing 270 changed files with 23,066 additions and 3,044 deletions.
27 changes: 14 additions & 13 deletions .devcontainer/library-scripts/common-debian.sh
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,9 @@ if [ "${PACKAGES_ALREADY_INSTALLED}" != "true" ]; then
strace \
manpages \
manpages-dev \
init-system-helpers"

init-system-helpers \
capnproto"

# Needed for adding manpages-posix and manpages-posix-dev which are non-free packages in Debian
if [ "${ADD_NON_FREE_PACKAGES}" = "true" ]; then
# Bring in variables from /etc/os-release like VERSION_CODENAME
Expand All @@ -124,7 +125,7 @@ if [ "${PACKAGES_ALREADY_INSTALLED}" != "true" ]; then
sed -i -E "s/deb-src http:\/\/(deb|httpredir)\.debian\.org\/debian ${VERSION_CODENAME}-updates main/deb http:\/\/\1\.debian\.org\/debian ${VERSION_CODENAME}-updates main contrib non-free/" /etc/apt/sources.list
sed -i "s/deb http:\/\/security\.debian\.org\/debian-security ${VERSION_CODENAME}\/updates main/deb http:\/\/security\.debian\.org\/debian-security ${VERSION_CODENAME}\/updates main contrib non-free/" /etc/apt/sources.list
sed -i "s/deb-src http:\/\/security\.debian\.org\/debian-security ${VERSION_CODENAME}\/updates main/deb http:\/\/security\.debian\.org\/debian-security ${VERSION_CODENAME}\/updates main contrib non-free/" /etc/apt/sources.list
sed -i "s/deb http:\/\/deb\.debian\.org\/debian ${VERSION_CODENAME}-backports main/deb http:\/\/deb\.debian\.org\/debian ${VERSION_CODENAME}-backports main contrib non-free/" /etc/apt/sources.list
sed -i "s/deb http:\/\/deb\.debian\.org\/debian ${VERSION_CODENAME}-backports main/deb http:\/\/deb\.debian\.org\/debian ${VERSION_CODENAME}-backports main contrib non-free/" /etc/apt/sources.list
sed -i "s/deb-src http:\/\/deb\.debian\.org\/debian ${VERSION_CODENAME}-backports main/deb http:\/\/deb\.debian\.org\/debian ${VERSION_CODENAME}-backports main contrib non-free/" /etc/apt/sources.list
# Handle bullseye location for security https://www.debian.org/releases/bullseye/amd64/release-notes/ch-information.en.html
sed -i "s/deb http:\/\/security\.debian\.org\/debian-security ${VERSION_CODENAME}-security main/deb http:\/\/security\.debian\.org\/debian-security ${VERSION_CODENAME}-security main contrib non-free/" /etc/apt/sources.list
Expand All @@ -140,7 +141,7 @@ if [ "${PACKAGES_ALREADY_INSTALLED}" != "true" ]; then
if [[ ! -z $(apt-cache --names-only search ^libssl1.1$) ]]; then
package_list="${package_list} libssl1.1"
fi

# Install appropriate version of libssl1.0.x if available
libssl_package=$(dpkg-query -f '${db:Status-Abbrev}\t${binary:Package}\n' -W 'libssl1\.0\.?' 2>&1 || echo '')
if [ "$(echo "$LIlibssl_packageBSSL" | grep -o 'libssl1\.0\.[0-9]:' | uniq | sort | wc -l)" -eq 0 ]; then
Expand All @@ -155,7 +156,7 @@ if [ "${PACKAGES_ALREADY_INSTALLED}" != "true" ]; then

echo "Packages to verify are installed: ${package_list}"
apt-get -y install --no-install-recommends ${package_list} 2> >( grep -v 'debconf: delaying package configuration, since apt-utils is not installed' >&2 )

# Install git if not already installed (may be more recent than distro version)
if ! type git > /dev/null 2>&1; then
apt-get -y install --no-install-recommends git
Expand All @@ -174,7 +175,7 @@ fi
# Ensure at least the en_US.UTF-8 UTF-8 locale is available.
# Common need for both applications and things like the agnoster ZSH theme.
if [ "${LOCALE_ALREADY_SET}" != "true" ] && ! grep -o -E '^\s*en_US.UTF-8\s+UTF-8' /etc/locale.gen > /dev/null; then
echo "en_US.UTF-8 UTF-8" >> /etc/locale.gen
echo "en_US.UTF-8 UTF-8" >> /etc/locale.gen
locale-gen
LOCALE_ALREADY_SET="true"
fi
Expand All @@ -183,12 +184,12 @@ fi
group_name="${USERNAME}"
if id -u ${USERNAME} > /dev/null 2>&1; then
# User exists, update if needed
if [ "${USER_GID}" != "automatic" ] && [ "$USER_GID" != "$(id -g $USERNAME)" ]; then
if [ "${USER_GID}" != "automatic" ] && [ "$USER_GID" != "$(id -g $USERNAME)" ]; then
group_name="$(id -gn $USERNAME)"
groupmod --gid $USER_GID ${group_name}
usermod --gid $USER_GID $USERNAME
fi
if [ "${USER_UID}" != "automatic" ] && [ "$USER_UID" != "$(id -u $USERNAME)" ]; then
if [ "${USER_UID}" != "automatic" ] && [ "$USER_UID" != "$(id -u $USERNAME)" ]; then
usermod --uid $USER_UID $USERNAME
fi
else
Expand All @@ -198,7 +199,7 @@ else
else
groupadd --gid $USER_GID $USERNAME
fi
if [ "${USER_UID}" = "automatic" ]; then
if [ "${USER_UID}" = "automatic" ]; then
useradd -s /bin/bash --gid $USERNAME -m $USERNAME
else
useradd -s /bin/bash --uid $USER_UID --gid $USERNAME -m $USERNAME
Expand All @@ -213,7 +214,7 @@ if [ "${USERNAME}" != "root" ] && [ "${EXISTING_NON_ROOT_USER}" != "${USERNAME}"
fi

# ** Shell customization section **
if [ "${USERNAME}" = "root" ]; then
if [ "${USERNAME}" = "root" ]; then
user_rc_path="/root"
else
user_rc_path="/home/${USERNAME}"
Expand Down Expand Up @@ -250,9 +251,9 @@ fi
# Set the default git editor if not already set
if [ -z "$(git config --get core.editor)" ] && [ -z "${GIT_EDITOR}" ]; then
if [ "${TERM_PROGRAM}" = "vscode" ]; then
if [[ -n $(command -v code-insiders) && -z $(command -v code) ]]; then
if [[ -n $(command -v code-insiders) && -z $(command -v code) ]]; then
export GIT_EDITOR="code-insiders --wait"
else
else
export GIT_EDITOR="code --wait"
fi
fi
Expand Down Expand Up @@ -329,7 +330,7 @@ codespaces_zsh="$(cat \
# Codespaces zsh prompt theme
__zsh_prompt() {
local prompt_username
if [ ! -z "${GITHUB_USER}" ]; then
if [ ! -z "${GITHUB_USER}" ]; then
prompt_username="@${GITHUB_USER}"
else
prompt_username="%n"
Expand Down
22 changes: 16 additions & 6 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,14 @@ doc/internal/services/images/shepherd.png
doc/internal/services/transport_agent/images/transport_agent.png
doc/plantuml.jar
external/external_deps/
external/capnproto
external/pycapnp
src/release
src/include/capnp
src/include/kj
src/lib/message_defs.capnp.h
src/*egg*
src/bin/dragon_hsta
src/build/dragon_hsta
src/bin/dragon-hsta
src/dist/
src/doxygen/
src/dragon/dlogging/pydragon_logging.c
Expand All @@ -47,11 +52,16 @@ src/dragon/launcher/pydragon_pmsgqueue.c
src/dragon/pydragon_*.c
src/dragon/transport/hsta/__init__.c
src/dragon/transport/hsta/__main__.c
src/dragon/transport/hsta/crash-*
src/dragon/transport/hsta/dragon_hsta
src/dragon/transport/hsta/hsta_dbg.*.out
src/dragon/transport/hsta/ideas.txt
src/dragon/transport/hsta/leak-*
src/include/dragon/return_codes_map.h
src/bin
src/dragon/transport/hsta/dragon-hsta
src/lib/message_defs.capnp.c++
src/include/dragon/message_defs.capnp.h
test/channels_subtests/test.out
src/dragon/infrastructure/message_defs.capnp
src/include/dragon/message_tcs.hpp
test/native/flimsgfrom
test/native/flimsgto
src/lib/_message_tcs.hpp
src/.dragon-config.mk
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "external/capnproto"]
path = external/capnproto
url = https://github.com/capnproto/capnproto.git
47 changes: 5 additions & 42 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -65,49 +65,12 @@ If you wish to run multi-node or don't want to run in a container, you must set
up your environment to run Dragon programs. Choose the version of Dragon to
download that goes with your installed version of Python. Python 3.9+ is required
to run Dragon. You must have Python installed and it must be in your path
somewhere. A common choice is to use a Python virtual environment, which can be
initialized from a base Python with:
somewhere.

.. code-block:: console
python3 -m venv --clear _env
. _env/bin/activate
The untarred distribution file contains several subdirectories. All provided commands
are relative to the directory that contains the README.rst.

* The `dragon-*.whl` file must be pip3 installed once for your environment.

.. code-block:: console
pip3 install --force-reinstall dragon-0.8-cp39-cp39-linux_x86_64.whl
* Check and possibly update your `PATH` environment variable to include the location of
pip installed console scripts, such as ~/.local/bin if you're not using a virtual environment.

.. code-block:: console
export PATH=~/.local/bin:${PATH}
* You must set up the environment by loading the dragon module as follows.

.. code-block:: console
module use [/path to dragon-0.8]/modulefiles
module load dragon
If you intend to use Dragon on your own Linux VM or an image that you
personally installed, you may need to enable module commands by adding the
following command to your ~/.bashrc or other login script.

.. code-block:: console
source /usr/share/modules/init/bash
If you use a different shell, look in the `init` directory for a script for
your shell.

You have completed the prerequisites for running Dragon with multiprocessing programs.
The untarred distribution file contains several subdirectories. Run the
./dragon-install file in its root directory to create a Python virtual
environment and install two wheel files. For further details, follow the
instructions in the README.md file in the distribution directory.

Running Dragon
==============
Expand Down
10 changes: 5 additions & 5 deletions doc/cbook/ai-in-the-loop.rst
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ The code of the other files can be found in the release package, inside `example
from itertools import count
from model import Net, make_features, infer, train
from dragon.native.process import Process, TemplateProcess, Popen
from dragon.native.process import Process, ProcessTemplate, Popen
from dragon.native.process_group import ProcessGroup
from dragon.infrastructure.connection import Connection
from dragon.native.machine import System
Expand Down Expand Up @@ -112,12 +112,12 @@ The code of the other files can be found in the release package, inside `example
grp = ProcessGroup(restart=False, pmi_enabled=True)
# Pipe the stdout output from the head process to a Dragon connection
grp.add_process(nproc=1, template=TemplateProcess(target=exe, args=args, cwd=run_dir, stdout=Popen.PIPE))
grp.add_process(nproc=1, template=ProcessTemplate(target=exe, args=args, cwd=run_dir, stdout=Popen.PIPE))
# All other ranks should have their output go to DEVNULL
grp.add_process(
nproc=num_ranks - 1,
template=TemplateProcess(target=exe, args=args, cwd=run_dir, stdout=Popen.DEVNULL),
template=ProcessTemplate(target=exe, args=args, cwd=run_dir, stdout=Popen.DEVNULL),
)
# start the process group
grp.init()
Expand Down Expand Up @@ -153,12 +153,12 @@ The code of the other files can be found in the release package, inside `example
grp = ProcessGroup(restart=False, pmi_enabled=True)
# Pipe the stdout output from the head process to a Dragon connection
grp.add_process(nproc=1, template=TemplateProcess(target=exe, args=args, cwd=run_dir, stdout=Popen.PIPE))
grp.add_process(nproc=1, template=ProcessTemplate(target=exe, args=args, cwd=run_dir, stdout=Popen.PIPE))
# All other ranks should have their output go to DEVNULL
grp.add_process(
nproc=num_ranks - 1,
template=TemplateProcess(target=exe, args=args, cwd=run_dir, stdout=Popen.DEVNULL),
template=ProcessTemplate(target=exe, args=args, cwd=run_dir, stdout=Popen.DEVNULL),
)
# start the process group
grp.init()
Expand Down
6 changes: 3 additions & 3 deletions doc/cbook/basic_pandarallel_demo.rst
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Basic Pandarallel Demonstration for Single Node Environment
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

This Jupyter benchmark is a simple use case for the pandarallel `parallel_apply` call.
It can be run with `dragon` and base multiprocessing to compare performance on your machine.
This Jupyter benchmark is a simple use case for the pandarallel `parallel_apply` call.
It can be run with `dragon` and base multiprocessing to compare performance on your machine.

The program demonstrates how to use `parallel_apply`, the multiprocessing version of pandas `apply`, on a pandas dataframe with random input.

Expand All @@ -12,4 +12,4 @@ The code demonstrates the following key concepts working with Dragon:
* How to use pandarallel and pandas with Dragon and base multiprocessing
* How pandarallel handles various dtypes

.. literalinclude:: ../../examples/jupyter/doc_ref/basic_pandarallel_demo.py
.. literalinclude:: ../../examples/jupyter/doc_ref/basic_pandarallel_demo.py
10 changes: 5 additions & 5 deletions doc/cbook/bioinfo_alignment_pandarallel_demo.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ The following notebook was used for the single-node comparison:
.. literalinclude:: ../../examples/jupyter/doc_ref/bioinformatics_alignment_pandarallel_demo.py

For the single-node run, both base multiprocessing and Dragon are compared. The runs utilized a single node with 2 AMD EPYC 7742 64-Core Processors with 128 cores.
Dragon employs a number of optimizations on base multiprocessing; the Dragon start method outperforms the use of the base multiprocessing spawn start method on the same hardware.
Dragon employs a number of optimizations on base multiprocessing; the Dragon start method outperforms the use of the base multiprocessing spawn start method on the same hardware.

The timing for the base multiprocessing runtime is:

Expand Down Expand Up @@ -71,10 +71,10 @@ The timing for the single-node Dragon runtime is:
-
- 27.174203

For multi-node Dragon run, the run was on 2 Apollo nodes. Each Apollo node has 1x AMD Rome CPU with 4x AMD MI100 GPUs and 128 cores.
The multi-node use case scales with the total number of CPUs reported by the allocation. As there are more nodes, workers, and CPUs available for multi-node, Dragon extends
multiprocessing's stock capabilities and demonstrates additional improvement to measured execution time.
Base multiprocessing does not support multi-node workloads.
For multi-node Dragon run, the run was on 2 Apollo nodes. Each Apollo node has 1x AMD Rome CPU with 4x AMD MI100 GPUs and 128 cores.
The multi-node use case scales with the total number of CPUs reported by the allocation. As there are more nodes, workers, and CPUs available for multi-node, Dragon extends
multiprocessing's stock capabilities and demonstrates additional improvement to measured execution time.
Base multiprocessing does not support multi-node workloads.

The following notebook was used for the multi-node comparison:

Expand Down
9 changes: 4 additions & 5 deletions doc/cbook/c_channels_demo.rst
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,6 @@ captured by Dragon, any error messages are displayed back to the user.
dragonMessage_t msg;
char* send_ser_encoded;
char* final_ser_encoded;
size_t send_ser_len;
/* This function is necessary for off-node communication and relies on the
* Dragon run-time services to supply gateway channels in the
Expand Down Expand Up @@ -232,7 +231,7 @@ captured by Dragon, any error messages are displayed back to the user.
* Dragon provides both base64 encoding and decoding for
* interoperability between languages. */
recv_chser.data = dragon_base64_decode(argv[3], strlen(argv[3]), &recv_chser.len);
recv_chser.data = dragon_base64_decode(argv[3], &recv_chser.len);
/* With a valid serialized descriptor you can attach to a channel. This
* attach here occurs on an off-node channel (except in the one node
Expand Down Expand Up @@ -317,7 +316,7 @@ captured by Dragon, any error messages are displayed back to the user.
return -1;
}
send_ser_encoded = dragon_base64_encode(send_chser.data, send_chser.len, &send_ser_len);
send_ser_encoded = dragon_base64_encode(send_chser.data, send_chser.len);
err = dragon_memory_pool_detach(&pool_descr);
if (err != DRAGON_SUCCESS) {
Expand All @@ -340,7 +339,7 @@ captured by Dragon, any error messages are displayed back to the user.
send_ser_encoded = argv[4];
final_ser_encoded = argv[5];
send_chser.data = dragon_base64_decode(send_ser_encoded, strlen(send_ser_encoded), &send_chser.len);
send_chser.data = dragon_base64_decode(send_ser_encoded, &send_chser.len);
err = dragon_channel_attach(&send_chser, &send_ch);
if (err != DRAGON_SUCCESS) {
Expand All @@ -355,7 +354,7 @@ captured by Dragon, any error messages are displayed back to the user.
return -1;
}
final_chser.data = dragon_base64_decode(final_ser_encoded, strlen(final_ser_encoded), &final_chser.len);
final_chser.data = dragon_base64_decode(final_ser_encoded, &final_chser.len);
err = dragon_channel_attach(&final_chser, &final_ch);
if (err != DRAGON_SUCCESS) {
Expand Down
1 change: 1 addition & 0 deletions doc/cbook/cbook.rst
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ Dragon Native
c_channels_demo.rst
dragon_native_queue.rst
dragon_mpi_workflow.rst
dragon_native_policy_demo.rst

Dragon Data (Preview)
=====================
Expand Down
4 changes: 2 additions & 2 deletions doc/cbook/distr-inf-telemetry.rst
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ in :numref:`single-prompt-response`.
.. figure:: images/llm-grafana-telem-data.jpg
:scale: 60%
:name: node-telemetry

**Node telemetry data that is visualized using Grafana GUI and highlights the load balanced nature of this example**


Expand Down Expand Up @@ -354,4 +354,4 @@ Description of the system used
==============================

For this example, an HPE Cray EX was used. Each node has AMD EPYC 7763 64-core
CPUs and 4x Nvidia A100 GPUs.
CPUs and 4x Nvidia A100 GPUs.
2 changes: 1 addition & 1 deletion doc/cbook/dragon_dict.rst
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,6 @@ aggregated rate of opearations as the dictionary managers are spawned across the
.. figure:: images/dragon_dict_results.png
:align: center
:scale: 25%
:name: multinode-results
:name: multinode-results

**Results on a multi-node setup**
6 changes: 3 additions & 3 deletions doc/cbook/dragon_mpi_workflow.rst
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ processes:
from dragon.globalservices import node
from dragon.globalservices.process import multi_join
from dragon.infrastructure.connection import Connection
from dragon.native.process import MSG_PIPE, MSG_DEVNULL, Process, TemplateProcess
from dragon.native.process import MSG_PIPE, MSG_DEVNULL, Process, ProcessTemplate
from dragon.native.process_group import ProcessGroup
logging.basicConfig(level=logging.INFO)
Expand Down Expand Up @@ -81,13 +81,13 @@ processes:
# Pipe the stdout output from the head process to a Dragon connection
grp.add_process(
nproc=1,
template=TemplateProcess(target=exe, args=args, cwd=run_dir, stdout=MSG_PIPE)
template=ProcessTemplate(target=exe, args=args, cwd=run_dir, stdout=MSG_PIPE)
)
# All other ranks should have their output go to DEVNULL
grp.add_process(
nproc=num_ranks-1,
template=TemplateProcess(target=exe, args=args, cwd=run_dir, stdout=MSG_DEVNULL)
template=ProcessTemplate(target=exe, args=args, cwd=run_dir, stdout=MSG_DEVNULL)
)
grp.init()
Expand Down
Loading

0 comments on commit c608fba

Please sign in to comment.