Merge pull request #13 from DragonHPC/version-0.9

Version 0.9
DragonHPC · May 3, 2024 · c608fba · c608fba
2 parents ca9f372 + 215a57a
commit c608fba
Show file tree

Hide file tree

Showing 270 changed files with 23,066 additions and 3,044 deletions.
diff --git a/.devcontainer/library-scripts/common-debian.sh b/.devcontainer/library-scripts/common-debian.sh
@@ -112,8 +112,9 @@ if [ "${PACKAGES_ALREADY_INSTALLED}" != "true" ]; then
         strace \
         manpages \
         manpages-dev \
-        init-system-helpers"
-
+        init-system-helpers \
+        capnproto"
+
     # Needed for adding manpages-posix and manpages-posix-dev which are non-free packages in Debian
     if [ "${ADD_NON_FREE_PACKAGES}" = "true" ]; then
         # Bring in variables from /etc/os-release like VERSION_CODENAME
@@ -124,7 +125,7 @@ if [ "${PACKAGES_ALREADY_INSTALLED}" != "true" ]; then
         sed -i -E "s/deb-src http:\/\/(deb|httpredir)\.debian\.org\/debian ${VERSION_CODENAME}-updates main/deb http:\/\/\1\.debian\.org\/debian ${VERSION_CODENAME}-updates main contrib non-free/" /etc/apt/sources.list
         sed -i "s/deb http:\/\/security\.debian\.org\/debian-security ${VERSION_CODENAME}\/updates main/deb http:\/\/security\.debian\.org\/debian-security ${VERSION_CODENAME}\/updates main contrib non-free/" /etc/apt/sources.list
         sed -i "s/deb-src http:\/\/security\.debian\.org\/debian-security ${VERSION_CODENAME}\/updates main/deb http:\/\/security\.debian\.org\/debian-security ${VERSION_CODENAME}\/updates main contrib non-free/" /etc/apt/sources.list
-        sed -i "s/deb http:\/\/deb\.debian\.org\/debian ${VERSION_CODENAME}-backports main/deb http:\/\/deb\.debian\.org\/debian ${VERSION_CODENAME}-backports main contrib non-free/" /etc/apt/sources.list 
+        sed -i "s/deb http:\/\/deb\.debian\.org\/debian ${VERSION_CODENAME}-backports main/deb http:\/\/deb\.debian\.org\/debian ${VERSION_CODENAME}-backports main contrib non-free/" /etc/apt/sources.list
         sed -i "s/deb-src http:\/\/deb\.debian\.org\/debian ${VERSION_CODENAME}-backports main/deb http:\/\/deb\.debian\.org\/debian ${VERSION_CODENAME}-backports main contrib non-free/" /etc/apt/sources.list
         # Handle bullseye location for security https://www.debian.org/releases/bullseye/amd64/release-notes/ch-information.en.html
         sed -i "s/deb http:\/\/security\.debian\.org\/debian-security ${VERSION_CODENAME}-security main/deb http:\/\/security\.debian\.org\/debian-security ${VERSION_CODENAME}-security main contrib non-free/" /etc/apt/sources.list
@@ -140,7 +141,7 @@ if [ "${PACKAGES_ALREADY_INSTALLED}" != "true" ]; then
     if [[ ! -z $(apt-cache --names-only search ^libssl1.1$) ]]; then
         package_list="${package_list}       libssl1.1"
     fi
-    
+
     # Install appropriate version of libssl1.0.x if available
     libssl_package=$(dpkg-query -f '${db:Status-Abbrev}\t${binary:Package}\n' -W 'libssl1\.0\.?' 2>&1 || echo '')
     if [ "$(echo "$LIlibssl_packageBSSL" | grep -o 'libssl1\.0\.[0-9]:' | uniq | sort | wc -l)" -eq 0 ]; then
@@ -155,7 +156,7 @@ if [ "${PACKAGES_ALREADY_INSTALLED}" != "true" ]; then
 
     echo "Packages to verify are installed: ${package_list}"
     apt-get -y install --no-install-recommends ${package_list} 2> >( grep -v 'debconf: delaying package configuration, since apt-utils is not installed' >&2 )
-        
+
     # Install git if not already installed (may be more recent than distro version)
     if ! type git > /dev/null 2>&1; then
         apt-get -y install --no-install-recommends git
@@ -174,7 +175,7 @@ fi
 # Ensure at least the en_US.UTF-8 UTF-8 locale is available.
 # Common need for both applications and things like the agnoster ZSH theme.
 if [ "${LOCALE_ALREADY_SET}" != "true" ] && ! grep -o -E '^\s*en_US.UTF-8\s+UTF-8' /etc/locale.gen > /dev/null; then
-    echo "en_US.UTF-8 UTF-8" >> /etc/locale.gen 
+    echo "en_US.UTF-8 UTF-8" >> /etc/locale.gen
     locale-gen
     LOCALE_ALREADY_SET="true"
 fi
@@ -183,12 +184,12 @@ fi
 group_name="${USERNAME}"
 if id -u ${USERNAME} > /dev/null 2>&1; then
     # User exists, update if needed
-    if [ "${USER_GID}" != "automatic" ] && [ "$USER_GID" != "$(id -g $USERNAME)" ]; then 
+    if [ "${USER_GID}" != "automatic" ] && [ "$USER_GID" != "$(id -g $USERNAME)" ]; then
         group_name="$(id -gn $USERNAME)"
         groupmod --gid $USER_GID ${group_name}
         usermod --gid $USER_GID $USERNAME
     fi
-    if [ "${USER_UID}" != "automatic" ] && [ "$USER_UID" != "$(id -u $USERNAME)" ]; then 
+    if [ "${USER_UID}" != "automatic" ] && [ "$USER_UID" != "$(id -u $USERNAME)" ]; then
         usermod --uid $USER_UID $USERNAME
     fi
 else
@@ -198,7 +199,7 @@ else
     else
         groupadd --gid $USER_GID $USERNAME
     fi
-    if [ "${USER_UID}" = "automatic" ]; then 
+    if [ "${USER_UID}" = "automatic" ]; then
         useradd -s /bin/bash --gid $USERNAME -m $USERNAME
     else
         useradd -s /bin/bash --uid $USER_UID --gid $USERNAME -m $USERNAME
@@ -213,7 +214,7 @@ if [ "${USERNAME}" != "root" ] && [ "${EXISTING_NON_ROOT_USER}" != "${USERNAME}"
 fi
 
 # ** Shell customization section **
-if [ "${USERNAME}" = "root" ]; then 
+if [ "${USERNAME}" = "root" ]; then
     user_rc_path="/root"
 else
     user_rc_path="/home/${USERNAME}"
@@ -250,9 +251,9 @@ fi
 # Set the default git editor if not already set
 if [ -z "$(git config --get core.editor)" ] && [ -z "${GIT_EDITOR}" ]; then
     if  [ "${TERM_PROGRAM}" = "vscode" ]; then
-        if [[ -n $(command -v code-insiders) &&  -z $(command -v code) ]]; then 
+        if [[ -n $(command -v code-insiders) &&  -z $(command -v code) ]]; then
             export GIT_EDITOR="code-insiders --wait"
-        else 
+        else
             export GIT_EDITOR="code --wait"
         fi
     fi
@@ -329,7 +330,7 @@ codespaces_zsh="$(cat \
 # Codespaces zsh prompt theme
 __zsh_prompt() {
     local prompt_username
-    if [ ! -z "${GITHUB_USER}" ]; then 
+    if [ ! -z "${GITHUB_USER}" ]; then
         prompt_username="@${GITHUB_USER}"
     else
         prompt_username="%n"

diff --git a/.gitignore b/.gitignore
@@ -34,9 +34,14 @@ doc/internal/services/images/shepherd.png
 doc/internal/services/transport_agent/images/transport_agent.png
 doc/plantuml.jar
 external/external_deps/
+external/capnproto
+external/pycapnp
+src/release
+src/include/capnp
+src/include/kj
+src/lib/message_defs.capnp.h
 src/*egg*
-src/bin/dragon_hsta
-src/build/dragon_hsta
+src/bin/dragon-hsta
 src/dist/
 src/doxygen/
 src/dragon/dlogging/pydragon_logging.c
@@ -47,11 +52,16 @@ src/dragon/launcher/pydragon_pmsgqueue.c
 src/dragon/pydragon_*.c
 src/dragon/transport/hsta/__init__.c
 src/dragon/transport/hsta/__main__.c
-src/dragon/transport/hsta/crash-*
 src/dragon/transport/hsta/dragon_hsta
-src/dragon/transport/hsta/hsta_dbg.*.out
-src/dragon/transport/hsta/ideas.txt
-src/dragon/transport/hsta/leak-*
 src/include/dragon/return_codes_map.h
 src/bin
 src/dragon/transport/hsta/dragon-hsta
+src/lib/message_defs.capnp.c++
+src/include/dragon/message_defs.capnp.h
+test/channels_subtests/test.out
+src/dragon/infrastructure/message_defs.capnp
+src/include/dragon/message_tcs.hpp
+test/native/flimsgfrom
+test/native/flimsgto
+src/lib/_message_tcs.hpp
+src/.dragon-config.mk
diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "external/capnproto"]
+	path = external/capnproto
+	url = https://github.com/capnproto/capnproto.git
diff --git a/README.rst b/README.rst
@@ -65,49 +65,12 @@ If you wish to run multi-node or don't want to run in a container, you must set
 up your environment to run Dragon programs. Choose the version of Dragon to
 download that goes with your installed version of Python. Python 3.9+ is required
 to run Dragon. You must have Python installed and it must be in your path
-somewhere. A common choice is to use a Python virtual environment, which can be
-initialized from a base Python with:
+somewhere.
 
-.. code-block:: console
-
-    python3 -m venv --clear _env
-    . _env/bin/activate
-
-The untarred distribution file contains several subdirectories. All provided commands
-are relative to the directory that contains the README.rst.
-
-* The `dragon-*.whl` file must be pip3 installed once for your environment.
-
-.. code-block:: console
-
-    pip3 install --force-reinstall dragon-0.8-cp39-cp39-linux_x86_64.whl
-
-* Check and possibly update your `PATH` environment variable to include the location of
-  pip installed console scripts, such as ~/.local/bin if you're not using a virtual environment.
-
-.. code-block:: console
-
-    export PATH=~/.local/bin:${PATH}
-
-* You must set up the environment by loading the dragon module as follows.
-
-.. code-block:: console
-
-    module use [/path to dragon-0.8]/modulefiles
-    module load dragon
-
-If you intend to use Dragon on your own Linux VM or an image that you
-personally installed, you may need to enable module commands by adding the
-following command to your ~/.bashrc or other login script.
-
-.. code-block:: console
-
-    source /usr/share/modules/init/bash
-
-If you use a different shell, look in the `init` directory for a script for
-your shell.
-
-You have completed the prerequisites for running Dragon with multiprocessing programs.
+The untarred distribution file contains several subdirectories. Run the
+./dragon-install file in its root directory to create a Python virtual
+environment and install two wheel files. For further details, follow the
+instructions in the README.md file in the distribution directory.
 
 Running Dragon
 ==============

diff --git a/doc/cbook/ai-in-the-loop.rst b/doc/cbook/ai-in-the-loop.rst
@@ -49,7 +49,7 @@ The code of the other files can be found in the release package, inside `example
     from itertools import count
     from model import Net, make_features, infer, train
 
-    from dragon.native.process import Process, TemplateProcess, Popen
+    from dragon.native.process import Process, ProcessTemplate, Popen
     from dragon.native.process_group import ProcessGroup
     from dragon.infrastructure.connection import Connection
     from dragon.native.machine import System
@@ -112,12 +112,12 @@ The code of the other files can be found in the release package, inside `example
         grp = ProcessGroup(restart=False, pmi_enabled=True)
 
         # Pipe the stdout output from the head process to a Dragon connection
-        grp.add_process(nproc=1, template=TemplateProcess(target=exe, args=args, cwd=run_dir, stdout=Popen.PIPE))
+        grp.add_process(nproc=1, template=ProcessTemplate(target=exe, args=args, cwd=run_dir, stdout=Popen.PIPE))
 
         # All other ranks should have their output go to DEVNULL
         grp.add_process(
             nproc=num_ranks - 1,
-            template=TemplateProcess(target=exe, args=args, cwd=run_dir, stdout=Popen.DEVNULL),
+            template=ProcessTemplate(target=exe, args=args, cwd=run_dir, stdout=Popen.DEVNULL),
         )
         # start the process group
         grp.init()
@@ -153,12 +153,12 @@ The code of the other files can be found in the release package, inside `example
         grp = ProcessGroup(restart=False, pmi_enabled=True)
 
         # Pipe the stdout output from the head process to a Dragon connection
-        grp.add_process(nproc=1, template=TemplateProcess(target=exe, args=args, cwd=run_dir, stdout=Popen.PIPE))
+        grp.add_process(nproc=1, template=ProcessTemplate(target=exe, args=args, cwd=run_dir, stdout=Popen.PIPE))
 
         # All other ranks should have their output go to DEVNULL
         grp.add_process(
             nproc=num_ranks - 1,
-            template=TemplateProcess(target=exe, args=args, cwd=run_dir, stdout=Popen.DEVNULL),
+            template=ProcessTemplate(target=exe, args=args, cwd=run_dir, stdout=Popen.DEVNULL),
         )
         # start the process group
         grp.init()

diff --git a/doc/cbook/basic_pandarallel_demo.rst b/doc/cbook/basic_pandarallel_demo.rst
@@ -1,8 +1,8 @@
 Basic Pandarallel Demonstration for Single Node Environment
 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 
-This Jupyter benchmark is a simple use case for the pandarallel `parallel_apply` call.
-It can be run with `dragon` and base multiprocessing to compare performance on your machine.
+This Jupyter benchmark is a simple use case for the pandarallel `parallel_apply` call. 
+It can be run with `dragon` and base multiprocessing to compare performance on your machine. 
 
 The program demonstrates how to use `parallel_apply`, the multiprocessing version of pandas `apply`, on a pandas dataframe with random input.
 
@@ -12,4 +12,4 @@ The code demonstrates the following key concepts working with Dragon:
 * How to use pandarallel and pandas with Dragon and base multiprocessing
 * How pandarallel handles various dtypes
 
-.. literalinclude:: ../../examples/jupyter/doc_ref/basic_pandarallel_demo.py
+.. literalinclude:: ../../examples/jupyter/doc_ref/basic_pandarallel_demo.py
diff --git a/doc/cbook/bioinfo_alignment_pandarallel_demo.rst b/doc/cbook/bioinfo_alignment_pandarallel_demo.rst
@@ -24,7 +24,7 @@ The following notebook was used for the single-node comparison:
 .. literalinclude:: ../../examples/jupyter/doc_ref/bioinformatics_alignment_pandarallel_demo.py
 
 For the single-node run, both base multiprocessing and Dragon are compared. The runs utilized a single node with 2 AMD EPYC 7742 64-Core Processors with 128 cores.
-Dragon employs a number of optimizations on base multiprocessing; the Dragon start method outperforms the use of the base multiprocessing spawn start method on the same hardware.
+Dragon employs a number of optimizations on base multiprocessing; the Dragon start method outperforms the use of the base multiprocessing spawn start method on the same hardware. 
 
 The timing for the base multiprocessing runtime is:
 
@@ -71,10 +71,10 @@ The timing for the single-node Dragon runtime is:
      -
      - 27.174203
 
-For multi-node Dragon run, the run was on 2 Apollo nodes. Each Apollo node has 1x AMD Rome CPU with 4x AMD MI100 GPUs and 128 cores.
-The multi-node use case scales with the total number of CPUs reported by the allocation. As there are more nodes, workers, and CPUs available for multi-node, Dragon extends
-multiprocessing's stock capabilities and demonstrates additional improvement to measured execution time.
-Base multiprocessing does not support multi-node workloads.
+For multi-node Dragon run, the run was on 2 Apollo nodes. Each Apollo node has 1x AMD Rome CPU with 4x AMD MI100 GPUs and 128 cores. 
+The multi-node use case scales with the total number of CPUs reported by the allocation. As there are more nodes, workers, and CPUs available for multi-node, Dragon extends 
+multiprocessing's stock capabilities and demonstrates additional improvement to measured execution time. 
+Base multiprocessing does not support multi-node workloads. 
 
 The following notebook was used for the multi-node comparison:
 

diff --git a/doc/cbook/c_channels_demo.rst b/doc/cbook/c_channels_demo.rst
@@ -200,7 +200,6 @@ captured by Dragon, any error messages are displayed back to the user.
         dragonMessage_t msg;
         char* send_ser_encoded;
         char* final_ser_encoded;
-        size_t send_ser_len;
 
         /* This function is necessary for off-node communication and relies on the
         * Dragon run-time services to supply gateway channels in the
@@ -232,7 +231,7 @@ captured by Dragon, any error messages are displayed back to the user.
         * Dragon provides both base64 encoding and decoding for
         * interoperability between languages. */
 
-        recv_chser.data = dragon_base64_decode(argv[3], strlen(argv[3]), &recv_chser.len);
+        recv_chser.data = dragon_base64_decode(argv[3], &recv_chser.len);
 
         /* With a valid serialized descriptor you can attach to a channel. This
         * attach here occurs on an off-node channel (except in the one node
@@ -317,7 +316,7 @@ captured by Dragon, any error messages are displayed back to the user.
                 return -1;
             }
 
-            send_ser_encoded = dragon_base64_encode(send_chser.data, send_chser.len, &send_ser_len);
+            send_ser_encoded = dragon_base64_encode(send_chser.data, send_chser.len);
 
             err = dragon_memory_pool_detach(&pool_descr);
             if (err != DRAGON_SUCCESS) {
@@ -340,7 +339,7 @@ captured by Dragon, any error messages are displayed back to the user.
             send_ser_encoded = argv[4];
             final_ser_encoded = argv[5];
 
-            send_chser.data = dragon_base64_decode(send_ser_encoded, strlen(send_ser_encoded), &send_chser.len);
+            send_chser.data = dragon_base64_decode(send_ser_encoded, &send_chser.len);
 
             err = dragon_channel_attach(&send_chser, &send_ch);
             if (err != DRAGON_SUCCESS) {
@@ -355,7 +354,7 @@ captured by Dragon, any error messages are displayed back to the user.
                 return -1;
             }
 
-            final_chser.data = dragon_base64_decode(final_ser_encoded, strlen(final_ser_encoded), &final_chser.len);
+            final_chser.data = dragon_base64_decode(final_ser_encoded, &final_chser.len);
 
             err = dragon_channel_attach(&final_chser, &final_ch);
             if (err != DRAGON_SUCCESS) {

diff --git a/doc/cbook/cbook.rst b/doc/cbook/cbook.rst
@@ -41,6 +41,7 @@ Dragon Native
    c_channels_demo.rst
    dragon_native_queue.rst
    dragon_mpi_workflow.rst
+   dragon_native_policy_demo.rst
 
 Dragon Data (Preview)
 =====================

diff --git a/doc/cbook/distr-inf-telemetry.rst b/doc/cbook/distr-inf-telemetry.rst
@@ -211,7 +211,7 @@ in :numref:`single-prompt-response`.
 .. figure:: images/llm-grafana-telem-data.jpg
     :scale: 60%
     :name: node-telemetry
-
+    
     **Node telemetry data that is visualized using Grafana GUI and highlights the load balanced nature of this example**
 
 
@@ -354,4 +354,4 @@ Description of the system used
 ==============================
 
 For this example, an HPE Cray EX was used. Each node has AMD EPYC 7763 64-core
-CPUs and 4x Nvidia A100 GPUs.
+CPUs and 4x Nvidia A100 GPUs.
diff --git a/doc/cbook/dragon_dict.rst b/doc/cbook/dragon_dict.rst
@@ -99,6 +99,6 @@ aggregated rate of opearations as the dictionary managers are spawned across the
 .. figure:: images/dragon_dict_results.png
     :align: center
     :scale: 25%
-    :name: multinode-results
+    :name: multinode-results 
 
     **Results on a multi-node setup**
diff --git a/doc/cbook/dragon_mpi_workflow.rst b/doc/cbook/dragon_mpi_workflow.rst
@@ -47,7 +47,7 @@ processes:
     from dragon.globalservices import node
     from dragon.globalservices.process import multi_join
     from dragon.infrastructure.connection import Connection
-    from dragon.native.process import MSG_PIPE, MSG_DEVNULL, Process, TemplateProcess
+    from dragon.native.process import MSG_PIPE, MSG_DEVNULL, Process, ProcessTemplate
     from dragon.native.process_group import ProcessGroup
 
     logging.basicConfig(level=logging.INFO)
@@ -81,13 +81,13 @@ processes:
         # Pipe the stdout output from the head process to a Dragon connection
         grp.add_process(
             nproc=1,
-            template=TemplateProcess(target=exe, args=args, cwd=run_dir, stdout=MSG_PIPE)
+            template=ProcessTemplate(target=exe, args=args, cwd=run_dir, stdout=MSG_PIPE)
         )
 
         # All other ranks should have their output go to DEVNULL
         grp.add_process(
             nproc=num_ranks-1,
-            template=TemplateProcess(target=exe, args=args, cwd=run_dir, stdout=MSG_DEVNULL)
+            template=ProcessTemplate(target=exe, args=args, cwd=run_dir, stdout=MSG_DEVNULL)
         )
 
         grp.init()